git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  *
  21  */
  22
  23 /**
  24  * @file h264.c
  25  * H.264 / AVC / MPEG4 part10 codec.
  26  * @author Michael Niedermayer <michaelni@gmx.at>
  27  */
  28
  29 #include "common.h"
  30 #include "dsputil.h"
  31 #include "avcodec.h"
  32 #include "mpegvideo.h"
  33 #include "h264data.h"
  34 #include "golomb.h"
  35
  36 #include "cabac.h"
  37
  38 //#undef NDEBUG
  39 #include <assert.h>
  40
  41 #define interlaced_dct interlaced_dct_is_a_bad_name
  42 #define mb_intra mb_intra_isnt_initalized_see_mb_type
  43
  44 #define LUMA_DC_BLOCK_INDEX   25
  45 #define CHROMA_DC_BLOCK_INDEX 26
  46
  47 #define CHROMA_DC_COEFF_TOKEN_VLC_BITS 8
  48 #define COEFF_TOKEN_VLC_BITS           8
  49 #define TOTAL_ZEROS_VLC_BITS           9
  50 #define CHROMA_DC_TOTAL_ZEROS_VLC_BITS 3
  51 #define RUN_VLC_BITS                   3
  52 #define RUN7_VLC_BITS                  6
  53
  54 #define MAX_SPS_COUNT 32
  55 #define MAX_PPS_COUNT 256
  56
  57 #define MAX_MMCO_COUNT 66
  58
  59 /* Compiling in interlaced support reduces the speed
  60  * of progressive decoding by about 2%. */
  61 #define ALLOW_INTERLACE
  62
  63 #ifdef ALLOW_INTERLACE
  64 #define MB_MBAFF h->mb_mbaff
  65 #define MB_FIELD h->mb_field_decoding_flag
  66 #define FRAME_MBAFF h->mb_aff_frame
  67 #else
  68 #define MB_MBAFF 0
  69 #define MB_FIELD 0
  70 #define FRAME_MBAFF 0
  71 #undef  IS_INTERLACED
  72 #define IS_INTERLACED(mb_type) 0
  73 #endif
  74
  75 /**
  76  * Sequence parameter set
  77  */
  78 typedef struct SPS{
  79
  80     int profile_idc;
  81     int level_idc;
  82     int transform_bypass;              ///< qpprime_y_zero_transform_bypass_flag
  83     int log2_max_frame_num;            ///< log2_max_frame_num_minus4 + 4
  84     int poc_type;                      ///< pic_order_cnt_type
  85     int log2_max_poc_lsb;              ///< log2_max_pic_order_cnt_lsb_minus4
  86     int delta_pic_order_always_zero_flag;
  87     int offset_for_non_ref_pic;
  88     int offset_for_top_to_bottom_field;
  89     int poc_cycle_length;              ///< num_ref_frames_in_pic_order_cnt_cycle
  90     int ref_frame_count;               ///< num_ref_frames
  91     int gaps_in_frame_num_allowed_flag;
  92     int mb_width;                      ///< frame_width_in_mbs_minus1 + 1
  93     int mb_height;                     ///< frame_height_in_mbs_minus1 + 1
  94     int frame_mbs_only_flag;
  95     int mb_aff;                        ///<mb_adaptive_frame_field_flag
  96     int direct_8x8_inference_flag;
  97     int crop;                   ///< frame_cropping_flag
  98     int crop_left;              ///< frame_cropping_rect_left_offset
  99     int crop_right;             ///< frame_cropping_rect_right_offset
 100     int crop_top;               ///< frame_cropping_rect_top_offset
 101     int crop_bottom;            ///< frame_cropping_rect_bottom_offset
 102     int vui_parameters_present_flag;
 103     AVRational sar;
 104     int timing_info_present_flag;
 105     uint32_t num_units_in_tick;
 106     uint32_t time_scale;
 107     int fixed_frame_rate_flag;
 108     short offset_for_ref_frame[256]; //FIXME dyn aloc?
 109     int bitstream_restriction_flag;
 110     int num_reorder_frames;
 111     int scaling_matrix_present;
 112     uint8_t scaling_matrix4[6][16];
 113     uint8_t scaling_matrix8[2][64];
 114 }SPS;
 115
 116 /**
 117  * Picture parameter set
 118  */
 119 typedef struct PPS{
 120     int sps_id;
 121     int cabac;                  ///< entropy_coding_mode_flag
 122     int pic_order_present;      ///< pic_order_present_flag
 123     int slice_group_count;      ///< num_slice_groups_minus1 + 1
 124     int mb_slice_group_map_type;
 125     int ref_count[2];           ///< num_ref_idx_l0/1_active_minus1 + 1
 126     int weighted_pred;          ///< weighted_pred_flag
 127     int weighted_bipred_idc;
 128     int init_qp;                ///< pic_init_qp_minus26 + 26
 129     int init_qs;                ///< pic_init_qs_minus26 + 26
 130     int chroma_qp_index_offset;
 131     int deblocking_filter_parameters_present; ///< deblocking_filter_parameters_present_flag
 132     int constrained_intra_pred; ///< constrained_intra_pred_flag
 133     int redundant_pic_cnt_present; ///< redundant_pic_cnt_present_flag
 134     int transform_8x8_mode;     ///< transform_8x8_mode_flag
 135     uint8_t scaling_matrix4[6][16];
 136     uint8_t scaling_matrix8[2][64];
 137 }PPS;
 138
 139 /**
 140  * Memory management control operation opcode.
 141  */
 142 typedef enum MMCOOpcode{
 143     MMCO_END=0,
 144     MMCO_SHORT2UNUSED,
 145     MMCO_LONG2UNUSED,
 146     MMCO_SHORT2LONG,
 147     MMCO_SET_MAX_LONG,
 148     MMCO_RESET,
 149     MMCO_LONG,
 150 } MMCOOpcode;
 151
 152 /**
 153  * Memory management control operation.
 154  */
 155 typedef struct MMCO{
 156     MMCOOpcode opcode;
 157     int short_frame_num;
 158     int long_index;
 159 } MMCO;
 160
 161 /**
 162  * H264Context
 163  */
 164 typedef struct H264Context{
 165     MpegEncContext s;
 166     int nal_ref_idc;
 167     int nal_unit_type;
 168 #define NAL_SLICE                1
 169 #define NAL_DPA                  2
 170 #define NAL_DPB                  3
 171 #define NAL_DPC                  4
 172 #define NAL_IDR_SLICE            5
 173 #define NAL_SEI                  6
 174 #define NAL_SPS                  7
 175 #define NAL_PPS                  8
 176 #define NAL_AUD                  9
 177 #define NAL_END_SEQUENCE        10
 178 #define NAL_END_STREAM          11
 179 #define NAL_FILLER_DATA         12
 180 #define NAL_SPS_EXT             13
 181 #define NAL_AUXILIARY_SLICE     19
 182     uint8_t *rbsp_buffer;
 183     unsigned int rbsp_buffer_size;
 184
 185     /**
 186       * Used to parse AVC variant of h264
 187       */
 188     int is_avc; ///< this flag is != 0 if codec is avc1
 189     int got_avcC; ///< flag used to parse avcC data only once
 190     int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4)
 191
 192     int chroma_qp; //QPc
 193
 194     int prev_mb_skipped;
 195     int next_mb_skipped;
 196
 197     //prediction stuff
 198     int chroma_pred_mode;
 199     int intra16x16_pred_mode;
 200
 201     int top_mb_xy;
 202     int left_mb_xy[2];
 203
 204     int8_t intra4x4_pred_mode_cache[5*8];
 205     int8_t (*intra4x4_pred_mode)[8];
 206     void (*pred4x4  [9+3])(uint8_t *src, uint8_t *topright, int stride);//FIXME move to dsp?
 207     void (*pred8x8l [9+3])(uint8_t *src, int topleft, int topright, int stride);
 208     void (*pred8x8  [4+3])(uint8_t *src, int stride);
 209     void (*pred16x16[4+3])(uint8_t *src, int stride);
 210     unsigned int topleft_samples_available;
 211     unsigned int top_samples_available;
 212     unsigned int topright_samples_available;
 213     unsigned int left_samples_available;
 214     uint8_t (*top_borders[2])[16+2*8];
 215     uint8_t left_border[2*(17+2*9)];
 216
 217     /**
 218      * non zero coeff count cache.
 219      * is 64 if not available.
 220      */
 221     DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]);
 222     uint8_t (*non_zero_count)[16];
 223
 224     /**
 225      * Motion vector cache.
 226      */
 227     DECLARE_ALIGNED_8(int16_t, mv_cache[2][5*8][2]);
 228     DECLARE_ALIGNED_8(int8_t, ref_cache[2][5*8]);
 229 #define LIST_NOT_USED -1 //FIXME rename?
 230 #define PART_NOT_AVAILABLE -2
 231
 232     /**
 233      * is 1 if the specific list MV&references are set to 0,0,-2.
 234      */
 235     int mv_cache_clean[2];
 236
 237     /**
 238      * number of neighbors (top and/or left) that used 8x8 dct
 239      */
 240     int neighbor_transform_size;
 241
 242     /**
 243      * block_offset[ 0..23] for frame macroblocks
 244      * block_offset[24..47] for field macroblocks
 245      */
 246     int block_offset[2*(16+8)];
 247
 248     uint32_t *mb2b_xy; //FIXME are these 4 a good idea?
 249     uint32_t *mb2b8_xy;
 250     int b_stride; //FIXME use s->b4_stride
 251     int b8_stride;
 252
 253     int mb_linesize;   ///< may be equal to s->linesize or s->linesize*2, for mbaff
 254     int mb_uvlinesize;
 255
 256     int emu_edge_width;
 257     int emu_edge_height;
 258
 259     int halfpel_flag;
 260     int thirdpel_flag;
 261
 262     int unknown_svq3_flag;
 263     int next_slice_index;
 264
 265     SPS sps_buffer[MAX_SPS_COUNT];
 266     SPS sps; ///< current sps
 267
 268     PPS pps_buffer[MAX_PPS_COUNT];
 269     /**
 270      * current pps
 271      */
 272     PPS pps; //FIXME move to Picture perhaps? (->no) do we need that?
 273
 274     uint32_t dequant4_buffer[6][52][16];
 275     uint32_t dequant8_buffer[2][52][64];
 276     uint32_t (*dequant4_coeff[6])[16];
 277     uint32_t (*dequant8_coeff[2])[64];
 278     int dequant_coeff_pps;     ///< reinit tables when pps changes
 279
 280     int slice_num;
 281     uint8_t *slice_table_base;
 282     uint8_t *slice_table;      ///< slice_table_base + 2*mb_stride + 1
 283     int slice_type;
 284     int slice_type_fixed;
 285
 286     //interlacing specific flags
 287     int mb_aff_frame;
 288     int mb_field_decoding_flag;
 289     int mb_mbaff;              ///< mb_aff_frame && mb_field_decoding_flag
 290
 291     int sub_mb_type[4];
 292
 293     //POC stuff
 294     int poc_lsb;
 295     int poc_msb;
 296     int delta_poc_bottom;
 297     int delta_poc[2];
 298     int frame_num;
 299     int prev_poc_msb;             ///< poc_msb of the last reference pic for POC type 0
 300     int prev_poc_lsb;             ///< poc_lsb of the last reference pic for POC type 0
 301     int frame_num_offset;         ///< for POC type 2
 302     int prev_frame_num_offset;    ///< for POC type 2
 303     int prev_frame_num;           ///< frame_num of the last pic for POC type 1/2
 304
 305     /**
 306      * frame_num for frames or 2*frame_num for field pics.
 307      */
 308     int curr_pic_num;
 309
 310     /**
 311      * max_frame_num or 2*max_frame_num for field pics.
 312      */
 313     int max_pic_num;
 314
 315     //Weighted pred stuff
 316     int use_weight;
 317     int use_weight_chroma;
 318     int luma_log2_weight_denom;
 319     int chroma_log2_weight_denom;
 320     int luma_weight[2][48];
 321     int luma_offset[2][48];
 322     int chroma_weight[2][48][2];
 323     int chroma_offset[2][48][2];
 324     int implicit_weight[48][48];
 325
 326     //deblock
 327     int deblocking_filter;         ///< disable_deblocking_filter_idc with 1<->0
 328     int slice_alpha_c0_offset;
 329     int slice_beta_offset;
 330
 331     int redundant_pic_count;
 332
 333     int direct_spatial_mv_pred;
 334     int dist_scale_factor[16];
 335     int dist_scale_factor_field[32];
 336     int map_col_to_list0[2][16];
 337     int map_col_to_list0_field[2][32];
 338
 339     /**
 340      * num_ref_idx_l0/1_active_minus1 + 1
 341      */
 342     int ref_count[2];            ///< counts frames or fields, depending on current mb mode
 343     Picture *short_ref[32];
 344     Picture *long_ref[32];
 345     Picture default_ref_list[2][32];
 346     Picture ref_list[2][48];     ///< 0..15: frame refs, 16..47: mbaff field refs
 347     Picture *delayed_pic[16]; //FIXME size?
 348     Picture *delayed_output_pic;
 349
 350     /**
 351      * memory management control operations buffer.
 352      */
 353     MMCO mmco[MAX_MMCO_COUNT];
 354     int mmco_index;
 355
 356     int long_ref_count;  ///< number of actual long term references
 357     int short_ref_count; ///< number of actual short term references
 358
 359     //data partitioning
 360     GetBitContext intra_gb;
 361     GetBitContext inter_gb;
 362     GetBitContext *intra_gb_ptr;
 363     GetBitContext *inter_gb_ptr;
 364
 365     DECLARE_ALIGNED_8(DCTELEM, mb[16*24]);
 366
 367     /**
 368      * Cabac
 369      */
 370     CABACContext cabac;
 371     uint8_t      cabac_state[460];
 372     int          cabac_init_idc;
 373
 374     /* 0x100 -> non null luma_dc, 0x80/0x40 -> non null chroma_dc (cb/cr), 0x?0 -> chroma_cbp(0,1,2), 0x0? luma_cbp */
 375     uint16_t     *cbp_table;
 376     int cbp;
 377     int top_cbp;
 378     int left_cbp;
 379     /* chroma_pred_mode for i4x4 or i16x16, else 0 */
 380     uint8_t     *chroma_pred_mode_table;
 381     int         last_qscale_diff;
 382     int16_t     (*mvd_table[2])[2];
 383     DECLARE_ALIGNED_8(int16_t, mvd_cache[2][5*8][2]);
 384     uint8_t     *direct_table;
 385     uint8_t     direct_cache[5*8];
 386
 387     uint8_t zigzag_scan[16];
 388     uint8_t zigzag_scan8x8[64];
 389     uint8_t zigzag_scan8x8_cavlc[64];
 390     uint8_t field_scan[16];
 391     uint8_t field_scan8x8[64];
 392     uint8_t field_scan8x8_cavlc[64];
 393     const uint8_t *zigzag_scan_q0;
 394     const uint8_t *zigzag_scan8x8_q0;
 395     const uint8_t *zigzag_scan8x8_cavlc_q0;
 396     const uint8_t *field_scan_q0;
 397     const uint8_t *field_scan8x8_q0;
 398     const uint8_t *field_scan8x8_cavlc_q0;
 399
 400     int x264_build;
 401 }H264Context;
 402
 403 static VLC coeff_token_vlc[4];
 404 static VLC chroma_dc_coeff_token_vlc;
 405
 406 static VLC total_zeros_vlc[15];
 407 static VLC chroma_dc_total_zeros_vlc[3];
 408
 409 static VLC run_vlc[6];
 410 static VLC run7_vlc;
 411
 412 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
 413 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
 414 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
 415 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
 416
 417 static always_inline uint32_t pack16to32(int a, int b){
 418 #ifdef WORDS_BIGENDIAN
 419    return (b&0xFFFF) + (a<<16);
 420 #else
 421    return (a&0xFFFF) + (b<<16);
 422 #endif
 423 }
 424
 425 /**
 426  * fill a rectangle.
 427  * @param h height of the rectangle, should be a constant
 428  * @param w width of the rectangle, should be a constant
 429  * @param size the size of val (1 or 4), should be a constant
 430  */
 431 static always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
 432     uint8_t *p= (uint8_t*)vp;
 433     assert(size==1 || size==4);
 434     assert(w<=4);
 435
 436     w      *= size;
 437     stride *= size;
 438
 439     assert((((long)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
 440     assert((stride&(w-1))==0);
 441     if(w==2){
 442         const uint16_t v= size==4 ? val : val*0x0101;
 443         *(uint16_t*)(p + 0*stride)= v;
 444         if(h==1) return;
 445         *(uint16_t*)(p + 1*stride)= v;
 446         if(h==2) return;
 447         *(uint16_t*)(p + 2*stride)=
 448         *(uint16_t*)(p + 3*stride)= v;
 449     }else if(w==4){
 450         const uint32_t v= size==4 ? val : val*0x01010101;
 451         *(uint32_t*)(p + 0*stride)= v;
 452         if(h==1) return;
 453         *(uint32_t*)(p + 1*stride)= v;
 454         if(h==2) return;
 455         *(uint32_t*)(p + 2*stride)=
 456         *(uint32_t*)(p + 3*stride)= v;
 457     }else if(w==8){
 458     //gcc can't optimize 64bit math on x86_32
 459 #if defined(ARCH_X86_64) || (defined(MP_WORDSIZE) && MP_WORDSIZE >= 64)
 460         const uint64_t v= val*0x0100000001ULL;
 461         *(uint64_t*)(p + 0*stride)= v;
 462         if(h==1) return;
 463         *(uint64_t*)(p + 1*stride)= v;
 464         if(h==2) return;
 465         *(uint64_t*)(p + 2*stride)=
 466         *(uint64_t*)(p + 3*stride)= v;
 467     }else if(w==16){
 468         const uint64_t v= val*0x0100000001ULL;
 469         *(uint64_t*)(p + 0+0*stride)=
 470         *(uint64_t*)(p + 8+0*stride)=
 471         *(uint64_t*)(p + 0+1*stride)=
 472         *(uint64_t*)(p + 8+1*stride)= v;
 473         if(h==2) return;
 474         *(uint64_t*)(p + 0+2*stride)=
 475         *(uint64_t*)(p + 8+2*stride)=
 476         *(uint64_t*)(p + 0+3*stride)=
 477         *(uint64_t*)(p + 8+3*stride)= v;
 478 #else
 479         *(uint32_t*)(p + 0+0*stride)=
 480         *(uint32_t*)(p + 4+0*stride)= val;
 481         if(h==1) return;
 482         *(uint32_t*)(p + 0+1*stride)=
 483         *(uint32_t*)(p + 4+1*stride)= val;
 484         if(h==2) return;
 485         *(uint32_t*)(p + 0+2*stride)=
 486         *(uint32_t*)(p + 4+2*stride)=
 487         *(uint32_t*)(p + 0+3*stride)=
 488         *(uint32_t*)(p + 4+3*stride)= val;
 489     }else if(w==16){
 490         *(uint32_t*)(p + 0+0*stride)=
 491         *(uint32_t*)(p + 4+0*stride)=
 492         *(uint32_t*)(p + 8+0*stride)=
 493         *(uint32_t*)(p +12+0*stride)=
 494         *(uint32_t*)(p + 0+1*stride)=
 495         *(uint32_t*)(p + 4+1*stride)=
 496         *(uint32_t*)(p + 8+1*stride)=
 497         *(uint32_t*)(p +12+1*stride)= val;
 498         if(h==2) return;
 499         *(uint32_t*)(p + 0+2*stride)=
 500         *(uint32_t*)(p + 4+2*stride)=
 501         *(uint32_t*)(p + 8+2*stride)=
 502         *(uint32_t*)(p +12+2*stride)=
 503         *(uint32_t*)(p + 0+3*stride)=
 504         *(uint32_t*)(p + 4+3*stride)=
 505         *(uint32_t*)(p + 8+3*stride)=
 506         *(uint32_t*)(p +12+3*stride)= val;
 507 #endif
 508     }else
 509         assert(0);
 510     assert(h==4);
 511 }
 512
 513 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 514     MpegEncContext * const s = &h->s;
 515     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 516     int topleft_xy, top_xy, topright_xy, left_xy[2];
 517     int topleft_type, top_type, topright_type, left_type[2];
 518     int left_block[8];
 519     int i;
 520
 521     //FIXME deblocking could skip the intra and nnz parts.
 522     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[mb_xy-s->mb_stride]) && !FRAME_MBAFF)
 523         return;
 524
 525     //wow what a mess, why didn't they simplify the interlacing&intra stuff, i can't imagine that these complex rules are worth it
 526
 527     top_xy     = mb_xy  - s->mb_stride;
 528     topleft_xy = top_xy - 1;
 529     topright_xy= top_xy + 1;
 530     left_xy[1] = left_xy[0] = mb_xy-1;
 531     left_block[0]= 0;
 532     left_block[1]= 1;
 533     left_block[2]= 2;
 534     left_block[3]= 3;
 535     left_block[4]= 7;
 536     left_block[5]= 10;
 537     left_block[6]= 8;
 538     left_block[7]= 11;
 539     if(FRAME_MBAFF){
 540         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 541         const int top_pair_xy      = pair_xy     - s->mb_stride;
 542         const int topleft_pair_xy  = top_pair_xy - 1;
 543         const int topright_pair_xy = top_pair_xy + 1;
 544         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 545         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 546         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 547         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 548         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 549         const int bottom = (s->mb_y & 1);
 550         tprintf("fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 551         if (bottom
 552                 ? !curr_mb_frame_flag // bottom macroblock
 553                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 554                 ) {
 555             top_xy -= s->mb_stride;
 556         }
 557         if (bottom
 558                 ? !curr_mb_frame_flag // bottom macroblock
 559                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 560                 ) {
 561             topleft_xy -= s->mb_stride;
 562         }
 563         if (bottom
 564                 ? !curr_mb_frame_flag // bottom macroblock
 565                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 566                 ) {
 567             topright_xy -= s->mb_stride;
 568         }
 569         if (left_mb_frame_flag != curr_mb_frame_flag) {
 570             left_xy[1] = left_xy[0] = pair_xy - 1;
 571             if (curr_mb_frame_flag) {
 572                 if (bottom) {
 573                     left_block[0]= 2;
 574                     left_block[1]= 2;
 575                     left_block[2]= 3;
 576                     left_block[3]= 3;
 577                     left_block[4]= 8;
 578                     left_block[5]= 11;
 579                     left_block[6]= 8;
 580                     left_block[7]= 11;
 581                 } else {
 582                     left_block[0]= 0;
 583                     left_block[1]= 0;
 584                     left_block[2]= 1;
 585                     left_block[3]= 1;
 586                     left_block[4]= 7;
 587                     left_block[5]= 10;
 588                     left_block[6]= 7;
 589                     left_block[7]= 10;
 590                 }
 591             } else {
 592                 left_xy[1] += s->mb_stride;
 593                 //left_block[0]= 0;
 594                 left_block[1]= 2;
 595                 left_block[2]= 0;
 596                 left_block[3]= 2;
 597                 //left_block[4]= 7;
 598                 left_block[5]= 10;
 599                 left_block[6]= 7;
 600                 left_block[7]= 10;
 601             }
 602         }
 603     }
 604
 605     h->top_mb_xy = top_xy;
 606     h->left_mb_xy[0] = left_xy[0];
 607     h->left_mb_xy[1] = left_xy[1];
 608     if(for_deblock){
 609         topleft_type = 0;
 610         topright_type = 0;
 611         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
 612         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
 613         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
 614
 615         if(FRAME_MBAFF && !IS_INTRA(mb_type)){
 616             int list;
 617             int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
 618             for(i=0; i<16; i++)
 619                 h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
 620             for(list=0; list<1+(h->slice_type==B_TYPE); list++){
 621                 if(USES_LIST(mb_type,list)){
 622                     uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
 623                     uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
 624                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 625                     for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
 626                         dst[0] = src[0];
 627                         dst[1] = src[1];
 628                         dst[2] = src[2];
 629                         dst[3] = src[3];
 630                     }
 631                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 632                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
 633                     ref += h->b8_stride;
 634                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 635                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
 636                 }else{
 637                     fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
 638                     fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
 639                 }
 640             }
 641         }
 642     }else{
 643         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 644         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 645         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 646         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 647         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 648     }
 649
 650     if(IS_INTRA(mb_type)){
 651         h->topleft_samples_available=
 652         h->top_samples_available=
 653         h->left_samples_available= 0xFFFF;
 654         h->topright_samples_available= 0xEEEA;
 655
 656         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
 657             h->topleft_samples_available= 0xB3FF;
 658             h->top_samples_available= 0x33FF;
 659             h->topright_samples_available= 0x26EA;
 660         }
 661         for(i=0; i<2; i++){
 662             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
 663                 h->topleft_samples_available&= 0xDF5F;
 664                 h->left_samples_available&= 0x5F5F;
 665             }
 666         }
 667
 668         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
 669             h->topleft_samples_available&= 0x7FFF;
 670
 671         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
 672             h->topright_samples_available&= 0xFBFF;
 673
 674         if(IS_INTRA4x4(mb_type)){
 675             if(IS_INTRA4x4(top_type)){
 676                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 677                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 678                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 679                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 680             }else{
 681                 int pred;
 682                 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
 683                     pred= -1;
 684                 else{
 685                     pred= 2;
 686                 }
 687                 h->intra4x4_pred_mode_cache[4+8*0]=
 688                 h->intra4x4_pred_mode_cache[5+8*0]=
 689                 h->intra4x4_pred_mode_cache[6+8*0]=
 690                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 691             }
 692             for(i=0; i<2; i++){
 693                 if(IS_INTRA4x4(left_type[i])){
 694                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 695                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 696                 }else{
 697                     int pred;
 698                     if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
 699                         pred= -1;
 700                     else{
 701                         pred= 2;
 702                     }
 703                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 704                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 705                 }
 706             }
 707         }
 708     }
 709
 710
 711 /*
 712 0 . T T. T T T T
 713 1 L . .L . . . .
 714 2 L . .L . . . .
 715 3 . T TL . . . .
 716 4 L . .L . . . .
 717 5 L . .. . . . .
 718 */
 719 //FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
 720     if(top_type){
 721         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 722         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 723         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 724         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 725
 726         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 727         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 728
 729         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 730         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 731
 732     }else{
 733         h->non_zero_count_cache[4+8*0]=
 734         h->non_zero_count_cache[5+8*0]=
 735         h->non_zero_count_cache[6+8*0]=
 736         h->non_zero_count_cache[7+8*0]=
 737
 738         h->non_zero_count_cache[1+8*0]=
 739         h->non_zero_count_cache[2+8*0]=
 740
 741         h->non_zero_count_cache[1+8*3]=
 742         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 743
 744     }
 745
 746     for (i=0; i<2; i++) {
 747         if(left_type[i]){
 748             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 749             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 750             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 751             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 752         }else{
 753             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 754             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 755             h->non_zero_count_cache[0+8*1 +   8*i]=
 756             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 757         }
 758     }
 759
 760     if( h->pps.cabac ) {
 761         // top_cbp
 762         if(top_type) {
 763             h->top_cbp = h->cbp_table[top_xy];
 764         } else if(IS_INTRA(mb_type)) {
 765             h->top_cbp = 0x1C0;
 766         } else {
 767             h->top_cbp = 0;
 768         }
 769         // left_cbp
 770         if (left_type[0]) {
 771             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 772         } else if(IS_INTRA(mb_type)) {
 773             h->left_cbp = 0x1C0;
 774         } else {
 775             h->left_cbp = 0;
 776         }
 777         if (left_type[0]) {
 778             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 779         }
 780         if (left_type[1]) {
 781             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 782         }
 783     }
 784
 785 #if 1
 786     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 787         int list;
 788         for(list=0; list<1+(h->slice_type==B_TYPE); list++){
 789             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 790                 /*if(!h->mv_cache_clean[list]){
 791                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 792                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 793                     h->mv_cache_clean[list]= 1;
 794                 }*/
 795                 continue;
 796             }
 797             h->mv_cache_clean[list]= 0;
 798
 799             if(USES_LIST(top_type, list)){
 800                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 801                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 802                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 803                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 804                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 805                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 806                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 807                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 808                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 809                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 810             }else{
 811                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 812                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 813                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 814                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 815                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 816             }
 817
 818             //FIXME unify cleanup or sth
 819             if(USES_LIST(left_type[0], list)){
 820                 const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 821                 const int b8_xy= h->mb2b8_xy[left_xy[0]] + 1;
 822                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0]];
 823                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1]];
 824                 h->ref_cache[list][scan8[0] - 1 + 0*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0]>>1)];
 825                 h->ref_cache[list][scan8[0] - 1 + 1*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1]>>1)];
 826             }else{
 827                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 0*8]=
 828                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 1*8]= 0;
 829                 h->ref_cache[list][scan8[0] - 1 + 0*8]=
 830                 h->ref_cache[list][scan8[0] - 1 + 1*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 831             }
 832
 833             if(USES_LIST(left_type[1], list)){
 834                 const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 835                 const int b8_xy= h->mb2b8_xy[left_xy[1]] + 1;
 836                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[2]];
 837                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[3]];
 838                 h->ref_cache[list][scan8[0] - 1 + 2*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[2]>>1)];
 839                 h->ref_cache[list][scan8[0] - 1 + 3*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[3]>>1)];
 840             }else{
 841                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 2*8]=
 842                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 3*8]= 0;
 843                 h->ref_cache[list][scan8[0] - 1 + 2*8]=
 844                 h->ref_cache[list][scan8[0] - 1 + 3*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 845                 assert((!left_type[0]) == (!left_type[1]));
 846             }
 847
 848             if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
 849                 continue;
 850
 851             if(USES_LIST(topleft_type, list)){
 852                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
 853                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride;
 854                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 855                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 856             }else{
 857                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 858                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 859             }
 860
 861             if(USES_LIST(topright_type, list)){
 862                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 863                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 864                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 865                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 866             }else{
 867                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 868                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 869             }
 870
 871             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 872                 continue;
 873
 874             h->ref_cache[list][scan8[5 ]+1] =
 875             h->ref_cache[list][scan8[7 ]+1] =
 876             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 877             h->ref_cache[list][scan8[4 ]] =
 878             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 879             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 880             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 881             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 882             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 883             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 884
 885             if( h->pps.cabac ) {
 886                 /* XXX beurk, Load mvd */
 887                 if(USES_LIST(top_type, list)){
 888                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 889                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 890                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 891                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 892                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 893                 }else{
 894                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 895                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 896                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 897                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 898                 }
 899                 if(USES_LIST(left_type[0], list)){
 900                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 901                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 902                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 903                 }else{
 904                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 905                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 906                 }
 907                 if(USES_LIST(left_type[1], list)){
 908                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 909                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 910                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 911                 }else{
 912                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 913                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 914                 }
 915                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 916                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 917                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 918                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 919                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 920
 921                 if(h->slice_type == B_TYPE){
 922                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 923
 924                     if(IS_DIRECT(top_type)){
 925                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 926                     }else if(IS_8X8(top_type)){
 927                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 928                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 929                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 930                     }else{
 931                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 932                     }
 933
 934                     if(IS_DIRECT(left_type[0]))
 935                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 936                     else if(IS_8X8(left_type[0]))
 937                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 938                     else
 939                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 940
 941                     if(IS_DIRECT(left_type[1]))
 942                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 943                     else if(IS_8X8(left_type[1]))
 944                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 945                     else
 946                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 947                 }
 948             }
 949
 950             if(FRAME_MBAFF){
 951 #define MAP_MVS\
 952                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 953                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 954                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 955                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 956                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 957                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 958                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 959                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 960                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 961                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 962                 if(MB_FIELD){
 963 #define MAP_F2F(idx, mb_type)\
 964                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 965                         h->ref_cache[list][idx] <<= 1;\
 966                         h->mv_cache[list][idx][1] /= 2;\
 967                         h->mvd_cache[list][idx][1] /= 2;\
 968                     }
 969                     MAP_MVS
 970 #undef MAP_F2F
 971                 }else{
 972 #define MAP_F2F(idx, mb_type)\
 973                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 974                         h->ref_cache[list][idx] >>= 1;\
 975                         h->mv_cache[list][idx][1] <<= 1;\
 976                         h->mvd_cache[list][idx][1] <<= 1;\
 977                     }
 978                     MAP_MVS
 979 #undef MAP_F2F
 980                 }
 981             }
 982         }
 983     }
 984 #endif
 985
 986     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 987 }
 988
 989 static inline void write_back_intra_pred_mode(H264Context *h){
 990     MpegEncContext * const s = &h->s;
 991     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 992
 993     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 994     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 995     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 996     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 997     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 998     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 999     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
1000 }
1001
1002 /**
1003  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
1004  */
1005 static inline int check_intra4x4_pred_mode(H264Context *h){
1006     MpegEncContext * const s = &h->s;
1007     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
1008     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
1009     int i;
1010
1011     if(!(h->top_samples_available&0x8000)){
1012         for(i=0; i<4; i++){
1013             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
1014             if(status<0){
1015                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
1016                 return -1;
1017             } else if(status){
1018                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
1019             }
1020         }
1021     }
1022
1023     if(!(h->left_samples_available&0x8000)){
1024         for(i=0; i<4; i++){
1025             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
1026             if(status<0){
1027                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
1028                 return -1;
1029             } else if(status){
1030                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
1031             }
1032         }
1033     }
1034
1035     return 0;
1036 } //FIXME cleanup like next
1037
1038 /**
1039  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
1040  */
1041 static inline int check_intra_pred_mode(H264Context *h, int mode){
1042     MpegEncContext * const s = &h->s;
1043     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
1044     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
1045
1046     if(mode < 0 || mode > 6) {
1047         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
1048         return -1;
1049     }
1050
1051     if(!(h->top_samples_available&0x8000)){
1052         mode= top[ mode ];
1053         if(mode<0){
1054             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
1055             return -1;
1056         }
1057     }
1058
1059     if(!(h->left_samples_available&0x8000)){
1060         mode= left[ mode ];
1061         if(mode<0){
1062             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
1063             return -1;
1064         }
1065     }
1066
1067     return mode;
1068 }
1069
1070 /**
1071  * gets the predicted intra4x4 prediction mode.
1072  */
1073 static inline int pred_intra_mode(H264Context *h, int n){
1074     const int index8= scan8[n];
1075     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
1076     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
1077     const int min= FFMIN(left, top);
1078
1079     tprintf("mode:%d %d min:%d\n", left ,top, min);
1080
1081     if(min<0) return DC_PRED;
1082     else      return min;
1083 }
1084
1085 static inline void write_back_non_zero_count(H264Context *h){
1086     MpegEncContext * const s = &h->s;
1087     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
1088
1089     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
1090     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
1091     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
1092     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
1093     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
1094     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
1095     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
1096
1097     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
1098     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
1099     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
1100
1101     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
1102     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
1103     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
1104
1105     if(FRAME_MBAFF){
1106         // store all luma nnzs, for deblocking
1107         int v = 0, i;
1108         for(i=0; i<16; i++)
1109             v += (!!h->non_zero_count_cache[scan8[i]]) << i;
1110         *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
1111     }
1112 }
1113
1114 /**
1115  * gets the predicted number of non zero coefficients.
1116  * @param n block index
1117  */
1118 static inline int pred_non_zero_count(H264Context *h, int n){
1119     const int index8= scan8[n];
1120     const int left= h->non_zero_count_cache[index8 - 1];
1121     const int top = h->non_zero_count_cache[index8 - 8];
1122     int i= left + top;
1123
1124     if(i<64) i= (i+1)>>1;
1125
1126     tprintf("pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
1127
1128     return i&31;
1129 }
1130
1131 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
1132     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
1133
1134     /* there is no consistent mapping of mvs to neighboring locations that will
1135      * make mbaff happy, so we can't move all this logic to fill_caches */
1136     if(FRAME_MBAFF){
1137         MpegEncContext *s = &h->s;
1138         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
1139         const int16_t *mv;
1140         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
1141         *C = h->mv_cache[list][scan8[0]-2];
1142
1143         if(!MB_FIELD
1144            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
1145             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
1146             if(IS_INTERLACED(mb_types[topright_xy])){
1147 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
1148                 const int x4 = X4, y4 = Y4;\
1149                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
1150                 if(!USES_LIST(mb_type,list) && !IS_8X8(mb_type))\
1151                     return LIST_NOT_USED;\
1152                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
1153                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
1154                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
1155                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
1156
1157                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
1158             }
1159         }
1160         if(topright_ref == PART_NOT_AVAILABLE
1161            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
1162            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
1163             if(!MB_FIELD
1164                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
1165                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
1166             }
1167             if(MB_FIELD
1168                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
1169                && i >= scan8[0]+8){
1170                 // leftshift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's ok.
1171                 SET_DIAG_MV(>>1, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
1172             }
1173         }
1174 #undef SET_DIAG_MV
1175     }
1176
1177     if(topright_ref != PART_NOT_AVAILABLE){
1178         *C= h->mv_cache[list][ i - 8 + part_width ];
1179         return topright_ref;
1180     }else{
1181         tprintf("topright MV not available\n");
1182
1183         *C= h->mv_cache[list][ i - 8 - 1 ];
1184         return h->ref_cache[list][ i - 8 - 1 ];
1185     }
1186 }
1187
1188 /**
1189  * gets the predicted MV.
1190  * @param n the block index
1191  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
1192  * @param mx the x component of the predicted motion vector
1193  * @param my the y component of the predicted motion vector
1194  */
1195 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
1196     const int index8= scan8[n];
1197     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
1198     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
1199     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
1200     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
1201     const int16_t * C;
1202     int diagonal_ref, match_count;
1203
1204     assert(part_width==1 || part_width==2 || part_width==4);
1205
1206 /* mv_cache
1207   B . . A T T T T
1208   U . . L . . , .
1209   U . . L . . . .
1210   U . . L . . , .
1211   . . . L . . . .
1212 */
1213
1214     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
1215     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
1216     tprintf("pred_motion match_count=%d\n", match_count);
1217     if(match_count > 1){ //most common
1218         *mx= mid_pred(A[0], B[0], C[0]);
1219         *my= mid_pred(A[1], B[1], C[1]);
1220     }else if(match_count==1){
1221         if(left_ref==ref){
1222             *mx= A[0];
1223             *my= A[1];
1224         }else if(top_ref==ref){
1225             *mx= B[0];
1226             *my= B[1];
1227         }else{
1228             *mx= C[0];
1229             *my= C[1];
1230         }
1231     }else{
1232         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
1233             *mx= A[0];
1234             *my= A[1];
1235         }else{
1236             *mx= mid_pred(A[0], B[0], C[0]);
1237             *my= mid_pred(A[1], B[1], C[1]);
1238         }
1239     }
1240
1241     tprintf("pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
1242 }
1243
1244 /**
1245  * gets the directionally predicted 16x8 MV.
1246  * @param n the block index
1247  * @param mx the x component of the predicted motion vector
1248  * @param my the y component of the predicted motion vector
1249  */
1250 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
1251     if(n==0){
1252         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
1253         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
1254
1255         tprintf("pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
1256
1257         if(top_ref == ref){
1258             *mx= B[0];
1259             *my= B[1];
1260             return;
1261         }
1262     }else{
1263         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
1264         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
1265
1266         tprintf("pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
1267
1268         if(left_ref == ref){
1269             *mx= A[0];
1270             *my= A[1];
1271             return;
1272         }
1273     }
1274
1275     //RARE
1276     pred_motion(h, n, 4, list, ref, mx, my);
1277 }
1278
1279 /**
1280  * gets the directionally predicted 8x16 MV.
1281  * @param n the block index
1282  * @param mx the x component of the predicted motion vector
1283  * @param my the y component of the predicted motion vector
1284  */
1285 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
1286     if(n==0){
1287         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
1288         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
1289
1290         tprintf("pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
1291
1292         if(left_ref == ref){
1293             *mx= A[0];
1294             *my= A[1];
1295             return;
1296         }
1297     }else{
1298         const int16_t * C;
1299         int diagonal_ref;
1300
1301         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
1302
1303         tprintf("pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
1304
1305         if(diagonal_ref == ref){
1306             *mx= C[0];
1307             *my= C[1];
1308             return;
1309         }
1310     }
1311
1312     //RARE
1313     pred_motion(h, n, 2, list, ref, mx, my);
1314 }
1315
1316 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
1317     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
1318     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
1319
1320     tprintf("pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
1321
1322     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
1323        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
1324        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
1325
1326         *mx = *my = 0;
1327         return;
1328     }
1329
1330     pred_motion(h, 0, 4, 0, 0, mx, my);
1331
1332     return;
1333 }
1334
1335 static inline void direct_dist_scale_factor(H264Context * const h){
1336     const int poc = h->s.current_picture_ptr->poc;
1337     const int poc1 = h->ref_list[1][0].poc;
1338     int i;
1339     for(i=0; i<h->ref_count[0]; i++){
1340         int poc0 = h->ref_list[0][i].poc;
1341         int td = clip(poc1 - poc0, -128, 127);
1342         if(td == 0 /* FIXME || pic0 is a long-term ref */){
1343             h->dist_scale_factor[i] = 256;
1344         }else{
1345             int tb = clip(poc - poc0, -128, 127);
1346             int tx = (16384 + (FFABS(td) >> 1)) / td;
1347             h->dist_scale_factor[i] = clip((tb*tx + 32) >> 6, -1024, 1023);
1348         }
1349     }
1350     if(FRAME_MBAFF){
1351         for(i=0; i<h->ref_count[0]; i++){
1352             h->dist_scale_factor_field[2*i] =
1353             h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
1354         }
1355     }
1356 }
1357 static inline void direct_ref_list_init(H264Context * const h){
1358     MpegEncContext * const s = &h->s;
1359     Picture * const ref1 = &h->ref_list[1][0];
1360     Picture * const cur = s->current_picture_ptr;
1361     int list, i, j;
1362     if(cur->pict_type == I_TYPE)
1363         cur->ref_count[0] = 0;
1364     if(cur->pict_type != B_TYPE)
1365         cur->ref_count[1] = 0;
1366     for(list=0; list<2; list++){
1367         cur->ref_count[list] = h->ref_count[list];
1368         for(j=0; j<h->ref_count[list]; j++)
1369             cur->ref_poc[list][j] = h->ref_list[list][j].poc;
1370     }
1371     if(cur->pict_type != B_TYPE || h->direct_spatial_mv_pred)
1372         return;
1373     for(list=0; list<2; list++){
1374         for(i=0; i<ref1->ref_count[list]; i++){
1375             const int poc = ref1->ref_poc[list][i];
1376             h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
1377             for(j=0; j<h->ref_count[list]; j++)
1378                 if(h->ref_list[list][j].poc == poc){
1379                     h->map_col_to_list0[list][i] = j;
1380                     break;
1381                 }
1382         }
1383     }
1384     if(FRAME_MBAFF){
1385         for(list=0; list<2; list++){
1386             for(i=0; i<ref1->ref_count[list]; i++){
1387                 j = h->map_col_to_list0[list][i];
1388                 h->map_col_to_list0_field[list][2*i] = 2*j;
1389                 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
1390             }
1391         }
1392     }
1393 }
1394
1395 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
1396     MpegEncContext * const s = &h->s;
1397     const int mb_xy =   s->mb_x +   s->mb_y*s->mb_stride;
1398     const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1399     const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1400     const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
1401     const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
1402     const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
1403     const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
1404     const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
1405     const int is_b8x8 = IS_8X8(*mb_type);
1406     int sub_mb_type;
1407     int i8, i4;
1408
1409 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
1410     if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
1411         /* FIXME save sub mb types from previous frames (or derive from MVs)
1412          * so we know exactly what block size to use */
1413         sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1414         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
1415     }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
1416         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1417         *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1418     }else{
1419         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1420         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
1421     }
1422     if(!is_b8x8)
1423         *mb_type |= MB_TYPE_DIRECT2;
1424     if(MB_FIELD)
1425         *mb_type |= MB_TYPE_INTERLACED;
1426
1427     tprintf("mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);
1428
1429     if(h->direct_spatial_mv_pred){
1430         int ref[2];
1431         int mv[2][2];
1432         int list;
1433
1434         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1435
1436         /* ref = min(neighbors) */
1437         for(list=0; list<2; list++){
1438             int refa = h->ref_cache[list][scan8[0] - 1];
1439             int refb = h->ref_cache[list][scan8[0] - 8];
1440             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1441             if(refc == -2)
1442                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1443             ref[list] = refa;
1444             if(ref[list] < 0 || (refb < ref[list] && refb >= 0))
1445                 ref[list] = refb;
1446             if(ref[list] < 0 || (refc < ref[list] && refc >= 0))
1447                 ref[list] = refc;
1448             if(ref[list] < 0)
1449                 ref[list] = -1;
1450         }
1451
1452         if(ref[0] < 0 && ref[1] < 0){
1453             ref[0] = ref[1] = 0;
1454             mv[0][0] = mv[0][1] =
1455             mv[1][0] = mv[1][1] = 0;
1456         }else{
1457             for(list=0; list<2; list++){
1458                 if(ref[list] >= 0)
1459                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1460                 else
1461                     mv[list][0] = mv[list][1] = 0;
1462             }
1463         }
1464
1465         if(ref[1] < 0){
1466             *mb_type &= ~MB_TYPE_P0L1;
1467             sub_mb_type &= ~MB_TYPE_P0L1;
1468         }else if(ref[0] < 0){
1469             *mb_type &= ~MB_TYPE_P0L0;
1470             sub_mb_type &= ~MB_TYPE_P0L0;
1471         }
1472
1473         if(IS_16X16(*mb_type)){
1474             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1475             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1476             if(!IS_INTRA(mb_type_col)
1477                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1478                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1479                        && (h->x264_build>33 || !h->x264_build)))){
1480                 if(ref[0] > 0)
1481                     fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1482                 else
1483                     fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
1484                 if(ref[1] > 0)
1485                     fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1486                 else
1487                     fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
1488             }else{
1489                 fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1490                 fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1491             }
1492         }else{
1493             for(i8=0; i8<4; i8++){
1494                 const int x8 = i8&1;
1495                 const int y8 = i8>>1;
1496
1497                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1498                     continue;
1499                 h->sub_mb_type[i8] = sub_mb_type;
1500
1501                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1502                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1503                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1504                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1505
1506                 /* col_zero_flag */
1507                 if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
1508                                               || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
1509                                                   && (h->x264_build>33 || !h->x264_build)))){
1510                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
1511                     if(IS_SUB_8X8(sub_mb_type)){
1512                         const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1513                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1514                             if(ref[0] == 0)
1515                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1516                             if(ref[1] == 0)
1517                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1518                         }
1519                     }else
1520                     for(i4=0; i4<4; i4++){
1521                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1522                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1523                             if(ref[0] == 0)
1524                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1525                             if(ref[1] == 0)
1526                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1527                         }
1528                     }
1529                 }
1530             }
1531         }
1532     }else{ /* direct temporal mv pred */
1533         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1534         const int *dist_scale_factor = h->dist_scale_factor;
1535
1536         if(FRAME_MBAFF){
1537             if(IS_INTERLACED(*mb_type)){
1538                 map_col_to_list0[0] = h->map_col_to_list0_field[0];
1539                 map_col_to_list0[1] = h->map_col_to_list0_field[1];
1540                 dist_scale_factor = h->dist_scale_factor_field;
1541             }
1542             if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1543                 /* FIXME assumes direct_8x8_inference == 1 */
1544                 const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1545                 int mb_types_col[2];
1546                 int y_shift;
1547
1548                 *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
1549                          | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
1550                          | (*mb_type & MB_TYPE_INTERLACED);
1551                 sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;
1552
1553                 if(IS_INTERLACED(*mb_type)){
1554                     /* frame to field scaling */
1555                     mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1556                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1557                     if(s->mb_y&1){
1558                         l1ref0 -= 2*h->b8_stride;
1559                         l1ref1 -= 2*h->b8_stride;
1560                         l1mv0 -= 4*h->b_stride;
1561                         l1mv1 -= 4*h->b_stride;
1562                     }
1563                     y_shift = 0;
1564
1565                     if(   (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
1566                        && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
1567                        && !is_b8x8)
1568                         *mb_type |= MB_TYPE_16x8;
1569                     else
1570                         *mb_type |= MB_TYPE_8x8;
1571                 }else{
1572                     /* field to frame scaling */
1573                     /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
1574                      * but in MBAFF, top and bottom POC are equal */
1575                     int dy = (s->mb_y&1) ? 1 : 2;
1576                     mb_types_col[0] =
1577                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1578                     l1ref0 += dy*h->b8_stride;
1579                     l1ref1 += dy*h->b8_stride;
1580                     l1mv0 += 2*dy*h->b_stride;
1581                     l1mv1 += 2*dy*h->b_stride;
1582                     y_shift = 2;
1583
1584                     if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
1585                        && !is_b8x8)
1586                         *mb_type |= MB_TYPE_16x16;
1587                     else
1588                         *mb_type |= MB_TYPE_8x8;
1589                 }
1590
1591                 for(i8=0; i8<4; i8++){
1592                     const int x8 = i8&1;
1593                     const int y8 = i8>>1;
1594                     int ref0, scale;
1595                     const int16_t (*l1mv)[2]= l1mv0;
1596
1597                     if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1598                         continue;
1599                     h->sub_mb_type[i8] = sub_mb_type;
1600
1601                     fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1602                     if(IS_INTRA(mb_types_col[y8])){
1603                         fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1604                         fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1605                         fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1606                         continue;
1607                     }
1608
1609                     ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
1610                     if(ref0 >= 0)
1611                         ref0 = map_col_to_list0[0][ref0*2>>y_shift];
1612                     else{
1613                         ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
1614                         l1mv= l1mv1;
1615                     }
1616                     scale = dist_scale_factor[ref0];
1617                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1618
1619                     {
1620                         const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
1621                         int my_col = (mv_col[1]<<y_shift)/2;
1622                         int mx = (scale * mv_col[0] + 128) >> 8;
1623                         int my = (scale * my_col + 128) >> 8;
1624                         fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1625                         fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1626                     }
1627                 }
1628                 return;
1629             }
1630         }
1631
1632         /* one-to-one mv scaling */
1633
1634         if(IS_16X16(*mb_type)){
1635             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1636             if(IS_INTRA(mb_type_col)){
1637                 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
1638                 fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
1639                 fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
1640             }else{
1641                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
1642                                                 : map_col_to_list0[1][l1ref1[0]];
1643                 const int scale = dist_scale_factor[ref0];
1644                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1645                 int mv_l0[2];
1646                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1647                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1648                 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref0, 1);
1649                 fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv_l0[0],mv_l0[1]), 4);
1650                 fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]), 4);
1651             }
1652         }else{
1653             for(i8=0; i8<4; i8++){
1654                 const int x8 = i8&1;
1655                 const int y8 = i8>>1;
1656                 int ref0, scale;
1657                 const int16_t (*l1mv)[2]= l1mv0;
1658
1659                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1660                     continue;
1661                 h->sub_mb_type[i8] = sub_mb_type;
1662                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1663                 if(IS_INTRA(mb_type_col)){
1664                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1665                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1666                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1667                     continue;
1668                 }
1669
1670                 ref0 = l1ref0[x8 + y8*h->b8_stride];
1671                 if(ref0 >= 0)
1672                     ref0 = map_col_to_list0[0][ref0];
1673                 else{
1674                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
1675                     l1mv= l1mv1;
1676                 }
1677                 scale = dist_scale_factor[ref0];
1678
1679                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1680                 if(IS_SUB_8X8(sub_mb_type)){
1681                     const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1682                     int mx = (scale * mv_col[0] + 128) >> 8;
1683                     int my = (scale * mv_col[1] + 128) >> 8;
1684                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1685                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1686                 }else
1687                 for(i4=0; i4<4; i4++){
1688                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1689                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1690                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1691                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1692                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1693                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1694                 }
1695             }
1696         }
1697     }
1698 }
1699
1700 static inline void write_back_motion(H264Context *h, int mb_type){
1701     MpegEncContext * const s = &h->s;
1702     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1703     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1704     int list;
1705
1706     if(!USES_LIST(mb_type, 0))
1707         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1708
1709     for(list=0; list<2; list++){
1710         int y;
1711         if(!USES_LIST(mb_type, list))
1712             continue;
1713
1714         for(y=0; y<4; y++){
1715             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1716             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1717         }
1718         if( h->pps.cabac ) {
1719             if(IS_SKIP(mb_type))
1720                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1721             else
1722             for(y=0; y<4; y++){
1723                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1724                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1725             }
1726         }
1727
1728         {
1729             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1730             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1731             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1732             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1733             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1734         }
1735     }
1736
1737     if(h->slice_type == B_TYPE && h->pps.cabac){
1738         if(IS_8X8(mb_type)){
1739             uint8_t *direct_table = &h->direct_table[b8_xy];
1740             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1741             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1742             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1743         }
1744     }
1745 }
1746
1747 /**
1748  * Decodes a network abstraction layer unit.
1749  * @param consumed is the number of bytes used as input
1750  * @param length is the length of the array
1751  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1752  * @returns decoded bytes, might be src+1 if no escapes
1753  */
1754 static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *consumed, int length){
1755     int i, si, di;
1756     uint8_t *dst;
1757
1758 //    src[0]&0x80;                //forbidden bit
1759     h->nal_ref_idc= src[0]>>5;
1760     h->nal_unit_type= src[0]&0x1F;
1761
1762     src++; length--;
1763 #if 0
1764     for(i=0; i<length; i++)
1765         printf("%2X ", src[i]);
1766 #endif
1767     for(i=0; i+1<length; i+=2){
1768         if(src[i]) continue;
1769         if(i>0 && src[i-1]==0) i--;
1770         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1771             if(src[i+2]!=3){
1772                 /* startcode, so we must be past the end */
1773                 length=i;
1774             }
1775             break;
1776         }
1777     }
1778
1779     if(i>=length-1){ //no escaped 0
1780         *dst_length= length;
1781         *consumed= length+1; //+1 for the header
1782         return src;
1783     }
1784
1785     h->rbsp_buffer= av_fast_realloc(h->rbsp_buffer, &h->rbsp_buffer_size, length);
1786     dst= h->rbsp_buffer;
1787
1788 //printf("decoding esc\n");
1789     si=di=0;
1790     while(si<length){
1791         //remove escapes (very rare 1:2^22)
1792         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1793             if(src[si+2]==3){ //escape
1794                 dst[di++]= 0;
1795                 dst[di++]= 0;
1796                 si+=3;
1797                 continue;
1798             }else //next start code
1799                 break;
1800         }
1801
1802         dst[di++]= src[si++];
1803     }
1804
1805     *dst_length= di;
1806     *consumed= si + 1;//+1 for the header
1807 //FIXME store exact number of bits in the getbitcontext (its needed for decoding)
1808     return dst;
1809 }
1810
1811 #if 0
1812 /**
1813  * @param src the data which should be escaped
1814  * @param dst the target buffer, dst+1 == src is allowed as a special case
1815  * @param length the length of the src data
1816  * @param dst_length the length of the dst array
1817  * @returns length of escaped data in bytes or -1 if an error occured
1818  */
1819 static int encode_nal(H264Context *h, uint8_t *dst, uint8_t *src, int length, int dst_length){
1820     int i, escape_count, si, di;
1821     uint8_t *temp;
1822
1823     assert(length>=0);
1824     assert(dst_length>0);
1825
1826     dst[0]= (h->nal_ref_idc<<5) + h->nal_unit_type;
1827
1828     if(length==0) return 1;
1829
1830     escape_count= 0;
1831     for(i=0; i<length; i+=2){
1832         if(src[i]) continue;
1833         if(i>0 && src[i-1]==0)
1834             i--;
1835         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1836             escape_count++;
1837             i+=2;
1838         }
1839     }
1840
1841     if(escape_count==0){
1842         if(dst+1 != src)
1843             memcpy(dst+1, src, length);
1844         return length + 1;
1845     }
1846
1847     if(length + escape_count + 1> dst_length)
1848         return -1;
1849
1850     //this should be damn rare (hopefully)
1851
1852     h->rbsp_buffer= av_fast_realloc(h->rbsp_buffer, &h->rbsp_buffer_size, length + escape_count);
1853     temp= h->rbsp_buffer;
1854 //printf("encoding esc\n");
1855
1856     si= 0;
1857     di= 0;
1858     while(si < length){
1859         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1860             temp[di++]= 0; si++;
1861             temp[di++]= 0; si++;
1862             temp[di++]= 3;
1863             temp[di++]= src[si++];
1864         }
1865         else
1866             temp[di++]= src[si++];
1867     }
1868     memcpy(dst+1, temp, length+escape_count);
1869
1870     assert(di == length+escape_count);
1871
1872     return di + 1;
1873 }
1874
1875 /**
1876  * write 1,10,100,1000,... for alignment, yes its exactly inverse to mpeg4
1877  */
1878 static void encode_rbsp_trailing(PutBitContext *pb){
1879     int length;
1880     put_bits(pb, 1, 1);
1881     length= (-put_bits_count(pb))&7;
1882     if(length) put_bits(pb, length, 0);
1883 }
1884 #endif
1885
1886 /**
1887  * identifies the exact end of the bitstream
1888  * @return the length of the trailing, or 0 if damaged
1889  */
1890 static int decode_rbsp_trailing(uint8_t *src){
1891     int v= *src;
1892     int r;
1893
1894     tprintf("rbsp trailing %X\n", v);
1895
1896     for(r=1; r<9; r++){
1897         if(v&1) return r;
1898         v>>=1;
1899     }
1900     return 0;
1901 }
1902
1903 /**
1904  * idct tranforms the 16 dc values and dequantize them.
1905  * @param qp quantization parameter
1906  */
1907 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1908 #define stride 16
1909     int i;
1910     int temp[16]; //FIXME check if this is a good idea
1911     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1912     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1913
1914 //memset(block, 64, 2*256);
1915 //return;
1916     for(i=0; i<4; i++){
1917         const int offset= y_offset[i];
1918         const int z0= block[offset+stride*0] + block[offset+stride*4];
1919         const int z1= block[offset+stride*0] - block[offset+stride*4];
1920         const int z2= block[offset+stride*1] - block[offset+stride*5];
1921         const int z3= block[offset+stride*1] + block[offset+stride*5];
1922
1923         temp[4*i+0]= z0+z3;
1924         temp[4*i+1]= z1+z2;
1925         temp[4*i+2]= z1-z2;
1926         temp[4*i+3]= z0-z3;
1927     }
1928
1929     for(i=0; i<4; i++){
1930         const int offset= x_offset[i];
1931         const int z0= temp[4*0+i] + temp[4*2+i];
1932         const int z1= temp[4*0+i] - temp[4*2+i];
1933         const int z2= temp[4*1+i] - temp[4*3+i];
1934         const int z3= temp[4*1+i] + temp[4*3+i];
1935
1936         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_resdual
1937         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1938         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1939         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1940     }
1941 }
1942
1943 #if 0
1944 /**
1945  * dct tranforms the 16 dc values.
1946  * @param qp quantization parameter ??? FIXME
1947  */
1948 static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
1949 //    const int qmul= dequant_coeff[qp][0];
1950     int i;
1951     int temp[16]; //FIXME check if this is a good idea
1952     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1953     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1954
1955     for(i=0; i<4; i++){
1956         const int offset= y_offset[i];
1957         const int z0= block[offset+stride*0] + block[offset+stride*4];
1958         const int z1= block[offset+stride*0] - block[offset+stride*4];
1959         const int z2= block[offset+stride*1] - block[offset+stride*5];
1960         const int z3= block[offset+stride*1] + block[offset+stride*5];
1961
1962         temp[4*i+0]= z0+z3;
1963         temp[4*i+1]= z1+z2;
1964         temp[4*i+2]= z1-z2;
1965         temp[4*i+3]= z0-z3;
1966     }
1967
1968     for(i=0; i<4; i++){
1969         const int offset= x_offset[i];
1970         const int z0= temp[4*0+i] + temp[4*2+i];
1971         const int z1= temp[4*0+i] - temp[4*2+i];
1972         const int z2= temp[4*1+i] - temp[4*3+i];
1973         const int z3= temp[4*1+i] + temp[4*3+i];
1974
1975         block[stride*0 +offset]= (z0 + z3)>>1;
1976         block[stride*2 +offset]= (z1 + z2)>>1;
1977         block[stride*8 +offset]= (z1 - z2)>>1;
1978         block[stride*10+offset]= (z0 - z3)>>1;
1979     }
1980 }
1981 #endif
1982
1983 #undef xStride
1984 #undef stride
1985
1986 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1987     const int stride= 16*2;
1988     const int xStride= 16;
1989     int a,b,c,d,e;
1990
1991     a= block[stride*0 + xStride*0];
1992     b= block[stride*0 + xStride*1];
1993     c= block[stride*1 + xStride*0];
1994     d= block[stride*1 + xStride*1];
1995
1996     e= a-b;
1997     a= a+b;
1998     b= c-d;
1999     c= c+d;
2000
2001     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
2002     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
2003     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
2004     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
2005 }
2006
2007 #if 0
2008 static void chroma_dc_dct_c(DCTELEM *block){
2009     const int stride= 16*2;
2010     const int xStride= 16;
2011     int a,b,c,d,e;
2012
2013     a= block[stride*0 + xStride*0];
2014     b= block[stride*0 + xStride*1];
2015     c= block[stride*1 + xStride*0];
2016     d= block[stride*1 + xStride*1];
2017
2018     e= a-b;
2019     a= a+b;
2020     b= c-d;
2021     c= c+d;
2022
2023     block[stride*0 + xStride*0]= (a+c);
2024     block[stride*0 + xStride*1]= (e+b);
2025     block[stride*1 + xStride*0]= (a-c);
2026     block[stride*1 + xStride*1]= (e-b);
2027 }
2028 #endif
2029
2030 /**
2031  * gets the chroma qp.
2032  */
2033 static inline int get_chroma_qp(int chroma_qp_index_offset, int qscale){
2034
2035     return chroma_qp[clip(qscale + chroma_qp_index_offset, 0, 51)];
2036 }
2037
2038
2039 #if 0
2040 static void h264_diff_dct_c(DCTELEM *block, uint8_t *src1, uint8_t *src2, int stride){
2041     int i;
2042     //FIXME try int temp instead of block
2043
2044     for(i=0; i<4; i++){
2045         const int d0= src1[0 + i*stride] - src2[0 + i*stride];
2046         const int d1= src1[1 + i*stride] - src2[1 + i*stride];
2047         const int d2= src1[2 + i*stride] - src2[2 + i*stride];
2048         const int d3= src1[3 + i*stride] - src2[3 + i*stride];
2049         const int z0= d0 + d3;
2050         const int z3= d0 - d3;
2051         const int z1= d1 + d2;
2052         const int z2= d1 - d2;
2053
2054         block[0 + 4*i]=   z0 +   z1;
2055         block[1 + 4*i]= 2*z3 +   z2;
2056         block[2 + 4*i]=   z0 -   z1;
2057         block[3 + 4*i]=   z3 - 2*z2;
2058     }
2059
2060     for(i=0; i<4; i++){
2061         const int z0= block[0*4 + i] + block[3*4 + i];
2062         const int z3= block[0*4 + i] - block[3*4 + i];
2063         const int z1= block[1*4 + i] + block[2*4 + i];
2064         const int z2= block[1*4 + i] - block[2*4 + i];
2065
2066         block[0*4 + i]=   z0 +   z1;
2067         block[1*4 + i]= 2*z3 +   z2;
2068         block[2*4 + i]=   z0 -   z1;
2069         block[3*4 + i]=   z3 - 2*z2;
2070     }
2071 }
2072 #endif
2073
2074 //FIXME need to check that this doesnt overflow signed 32 bit for low qp, i am not sure, it's very close
2075 //FIXME check that gcc inlines this (and optimizes intra & seperate_dc stuff away)
2076 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int seperate_dc){
2077     int i;
2078     const int * const quant_table= quant_coeff[qscale];
2079     const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
2080     const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
2081     const unsigned int threshold2= (threshold1<<1);
2082     int last_non_zero;
2083
2084     if(seperate_dc){
2085         if(qscale<=18){
2086             //avoid overflows
2087             const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
2088             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
2089             const unsigned int dc_threshold2= (dc_threshold1<<1);
2090
2091             int level= block[0]*quant_coeff[qscale+18][0];
2092             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
2093                 if(level>0){
2094                     level= (dc_bias + level)>>(QUANT_SHIFT-2);
2095                     block[0]= level;
2096                 }else{
2097                     level= (dc_bias - level)>>(QUANT_SHIFT-2);
2098                     block[0]= -level;
2099                 }
2100 //                last_non_zero = i;
2101             }else{
2102                 block[0]=0;
2103             }
2104         }else{
2105             const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
2106             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
2107             const unsigned int dc_threshold2= (dc_threshold1<<1);
2108
2109             int level= block[0]*quant_table[0];
2110             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
2111                 if(level>0){
2112                     level= (dc_bias + level)>>(QUANT_SHIFT+1);
2113                     block[0]= level;
2114                 }else{
2115                     level= (dc_bias - level)>>(QUANT_SHIFT+1);
2116                     block[0]= -level;
2117                 }
2118 //                last_non_zero = i;
2119             }else{
2120                 block[0]=0;
2121             }
2122         }
2123         last_non_zero= 0;
2124         i=1;
2125     }else{
2126         last_non_zero= -1;
2127         i=0;
2128     }
2129
2130     for(; i<16; i++){
2131         const int j= scantable[i];
2132         int level= block[j]*quant_table[j];
2133
2134 //        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
2135 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
2136         if(((unsigned)(level+threshold1))>threshold2){
2137             if(level>0){
2138                 level= (bias + level)>>QUANT_SHIFT;
2139                 block[j]= level;
2140             }else{
2141                 level= (bias - level)>>QUANT_SHIFT;
2142                 block[j]= -level;
2143             }
2144             last_non_zero = i;
2145         }else{
2146             block[j]=0;
2147         }
2148     }
2149
2150     return last_non_zero;
2151 }
2152
2153 static void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){
2154     const uint32_t a= ((uint32_t*)(src-stride))[0];
2155     ((uint32_t*)(src+0*stride))[0]= a;
2156     ((uint32_t*)(src+1*stride))[0]= a;
2157     ((uint32_t*)(src+2*stride))[0]= a;
2158     ((uint32_t*)(src+3*stride))[0]= a;
2159 }
2160
2161 static void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){
2162     ((uint32_t*)(src+0*stride))[0]= src[-1+0*stride]*0x01010101;
2163     ((uint32_t*)(src+1*stride))[0]= src[-1+1*stride]*0x01010101;
2164     ((uint32_t*)(src+2*stride))[0]= src[-1+2*stride]*0x01010101;
2165     ((uint32_t*)(src+3*stride))[0]= src[-1+3*stride]*0x01010101;
2166 }
2167
2168 static void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){
2169     const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
2170                    + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
2171
2172     ((uint32_t*)(src+0*stride))[0]=
2173     ((uint32_t*)(src+1*stride))[0]=
2174     ((uint32_t*)(src+2*stride))[0]=
2175     ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101;
2176 }
2177
2178 static void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){
2179     const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
2180
2181     ((uint32_t*)(src+0*stride))[0]=
2182     ((uint32_t*)(src+1*stride))[0]=
2183     ((uint32_t*)(src+2*stride))[0]=
2184     ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101;
2185 }
2186
2187 static void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){
2188     const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
2189
2190     ((uint32_t*)(src+0*stride))[0]=
2191     ((uint32_t*)(src+1*stride))[0]=
2192     ((uint32_t*)(src+2*stride))[0]=
2193     ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101;
2194 }
2195
2196 static void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){
2197     ((uint32_t*)(src+0*stride))[0]=
2198     ((uint32_t*)(src+1*stride))[0]=
2199     ((uint32_t*)(src+2*stride))[0]=
2200     ((uint32_t*)(src+3*stride))[0]= 128U*0x01010101U;
2201 }
2202
2203
2204 #define LOAD_TOP_RIGHT_EDGE\
2205     const int t4= topright[0];\
2206     const int t5= topright[1];\
2207     const int t6= topright[2];\
2208     const int t7= topright[3];\
2209
2210 #define LOAD_LEFT_EDGE\
2211     const int l0= src[-1+0*stride];\
2212     const int l1= src[-1+1*stride];\
2213     const int l2= src[-1+2*stride];\
2214     const int l3= src[-1+3*stride];\
2215
2216 #define LOAD_TOP_EDGE\
2217     const int t0= src[ 0-1*stride];\
2218     const int t1= src[ 1-1*stride];\
2219     const int t2= src[ 2-1*stride];\
2220     const int t3= src[ 3-1*stride];\
2221
2222 static void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){
2223     const int lt= src[-1-1*stride];
2224     LOAD_TOP_EDGE
2225     LOAD_LEFT_EDGE
2226
2227     src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2;
2228     src[0+2*stride]=
2229     src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
2230     src[0+1*stride]=
2231     src[1+2*stride]=
2232     src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
2233     src[0+0*stride]=
2234     src[1+1*stride]=
2235     src[2+2*stride]=
2236     src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
2237     src[1+0*stride]=
2238     src[2+1*stride]=
2239     src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
2240     src[2+0*stride]=
2241     src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
2242     src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
2243 }
2244
2245 static void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){
2246     LOAD_TOP_EDGE
2247     LOAD_TOP_RIGHT_EDGE
2248 //    LOAD_LEFT_EDGE
2249
2250     src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
2251     src[1+0*stride]=
2252     src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
2253     src[2+0*stride]=
2254     src[1+1*stride]=
2255     src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
2256     src[3+0*stride]=
2257     src[2+1*stride]=
2258     src[1+2*stride]=
2259     src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
2260     src[3+1*stride]=
2261     src[2+2*stride]=
2262     src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
2263     src[3+2*stride]=
2264     src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
2265     src[3+3*stride]=(t6 + 3*t7 + 2)>>2;
2266 }
2267
2268 static void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){
2269     const int lt= src[-1-1*stride];
2270     LOAD_TOP_EDGE
2271     LOAD_LEFT_EDGE
2272     const __attribute__((unused)) int unu= l3;
2273
2274     src[0+0*stride]=
2275     src[1+2*stride]=(lt + t0 + 1)>>1;
2276     src[1+0*stride]=
2277     src[2+2*stride]=(t0 + t1 + 1)>>1;
2278     src[2+0*stride]=
2279     src[3+2*stride]=(t1 + t2 + 1)>>1;
2280     src[3+0*stride]=(t2 + t3 + 1)>>1;
2281     src[0+1*stride]=
2282     src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
2283     src[1+1*stride]=
2284     src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
2285     src[2+1*stride]=
2286     src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
2287     src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
2288     src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
2289     src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
2290 }
2291
2292 static void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){
2293     LOAD_TOP_EDGE
2294     LOAD_TOP_RIGHT_EDGE
2295     const __attribute__((unused)) int unu= t7;
2296
2297     src[0+0*stride]=(t0 + t1 + 1)>>1;
2298     src[1+0*stride]=
2299     src[0+2*stride]=(t1 + t2 + 1)>>1;
2300     src[2+0*stride]=
2301     src[1+2*stride]=(t2 + t3 + 1)>>1;
2302     src[3+0*stride]=
2303     src[2+2*stride]=(t3 + t4+ 1)>>1;
2304     src[3+2*stride]=(t4 + t5+ 1)>>1;
2305     src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
2306     src[1+1*stride]=
2307     src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
2308     src[2+1*stride]=
2309     src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
2310     src[3+1*stride]=
2311     src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
2312     src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
2313 }
2314
2315 static void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){
2316     LOAD_LEFT_EDGE
2317
2318     src[0+0*stride]=(l0 + l1 + 1)>>1;
2319     src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
2320     src[2+0*stride]=
2321     src[0+1*stride]=(l1 + l2 + 1)>>1;
2322     src[3+0*stride]=
2323     src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
2324     src[2+1*stride]=
2325     src[0+2*stride]=(l2 + l3 + 1)>>1;
2326     src[3+1*stride]=
2327     src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2;
2328     src[3+2*stride]=
2329     src[1+3*stride]=
2330     src[0+3*stride]=
2331     src[2+2*stride]=
2332     src[2+3*stride]=
2333     src[3+3*stride]=l3;
2334 }
2335
2336 static void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){
2337     const int lt= src[-1-1*stride];
2338     LOAD_TOP_EDGE
2339     LOAD_LEFT_EDGE
2340     const __attribute__((unused)) int unu= t3;
2341
2342     src[0+0*stride]=
2343     src[2+1*stride]=(lt + l0 + 1)>>1;
2344     src[1+0*stride]=
2345     src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
2346     src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
2347     src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
2348     src[0+1*stride]=
2349     src[2+2*stride]=(l0 + l1 + 1)>>1;
2350     src[1+1*stride]=
2351     src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
2352     src[0+2*stride]=
2353     src[2+3*stride]=(l1 + l2+ 1)>>1;
2354     src[1+2*stride]=
2355     src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
2356     src[0+3*stride]=(l2 + l3 + 1)>>1;
2357     src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
2358 }
2359
2360 static void pred16x16_vertical_c(uint8_t *src, int stride){
2361     int i;
2362     const uint32_t a= ((uint32_t*)(src-stride))[0];
2363     const uint32_t b= ((uint32_t*)(src-stride))[1];
2364     const uint32_t c= ((uint32_t*)(src-stride))[2];
2365     const uint32_t d= ((uint32_t*)(src-stride))[3];
2366
2367     for(i=0; i<16; i++){
2368         ((uint32_t*)(src+i*stride))[0]= a;
2369         ((uint32_t*)(src+i*stride))[1]= b;
2370         ((uint32_t*)(src+i*stride))[2]= c;
2371         ((uint32_t*)(src+i*stride))[3]= d;
2372     }
2373 }
2374
2375 static void pred16x16_horizontal_c(uint8_t *src, int stride){
2376     int i;
2377
2378     for(i=0; i<16; i++){
2379         ((uint32_t*)(src+i*stride))[0]=
2380         ((uint32_t*)(src+i*stride))[1]=
2381         ((uint32_t*)(src+i*stride))[2]=
2382         ((uint32_t*)(src+i*stride))[3]= src[-1+i*stride]*0x01010101;
2383     }
2384 }
2385
2386 static void pred16x16_dc_c(uint8_t *src, int stride){
2387     int i, dc=0;
2388
2389     for(i=0;i<16; i++){
2390         dc+= src[-1+i*stride];
2391     }
2392
2393     for(i=0;i<16; i++){
2394         dc+= src[i-stride];
2395     }
2396
2397     dc= 0x01010101*((dc + 16)>>5);
2398
2399     for(i=0; i<16; i++){
2400         ((uint32_t*)(src+i*stride))[0]=
2401         ((uint32_t*)(src+i*stride))[1]=
2402         ((uint32_t*)(src+i*stride))[2]=
2403         ((uint32_t*)(src+i*stride))[3]= dc;
2404     }
2405 }
2406
2407 static void pred16x16_left_dc_c(uint8_t *src, int stride){
2408     int i, dc=0;
2409
2410     for(i=0;i<16; i++){
2411         dc+= src[-1+i*stride];
2412     }
2413
2414     dc= 0x01010101*((dc + 8)>>4);
2415
2416     for(i=0; i<16; i++){
2417         ((uint32_t*)(src+i*stride))[0]=
2418         ((uint32_t*)(src+i*stride))[1]=
2419         ((uint32_t*)(src+i*stride))[2]=
2420         ((uint32_t*)(src+i*stride))[3]= dc;
2421     }
2422 }
2423
2424 static void pred16x16_top_dc_c(uint8_t *src, int stride){
2425     int i, dc=0;
2426
2427     for(i=0;i<16; i++){
2428         dc+= src[i-stride];
2429     }
2430     dc= 0x01010101*((dc + 8)>>4);
2431
2432     for(i=0; i<16; i++){
2433         ((uint32_t*)(src+i*stride))[0]=
2434         ((uint32_t*)(src+i*stride))[1]=
2435         ((uint32_t*)(src+i*stride))[2]=
2436         ((uint32_t*)(src+i*stride))[3]= dc;
2437     }
2438 }
2439
2440 static void pred16x16_128_dc_c(uint8_t *src, int stride){
2441     int i;
2442
2443     for(i=0; i<16; i++){
2444         ((uint32_t*)(src+i*stride))[0]=
2445         ((uint32_t*)(src+i*stride))[1]=
2446         ((uint32_t*)(src+i*stride))[2]=
2447         ((uint32_t*)(src+i*stride))[3]= 0x01010101U*128U;
2448     }
2449 }
2450
2451 static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3){
2452   int i, j, k;
2453   int a;
2454   uint8_t *cm = cropTbl + MAX_NEG_CROP;
2455   const uint8_t * const src0 = src+7-stride;
2456   const uint8_t *src1 = src+8*stride-1;
2457   const uint8_t *src2 = src1-2*stride;      // == src+6*stride-1;
2458   int H = src0[1] - src0[-1];
2459   int V = src1[0] - src2[ 0];
2460   for(k=2; k<=8; ++k) {
2461     src1 += stride; src2 -= stride;
2462     H += k*(src0[k] - src0[-k]);
2463     V += k*(src1[0] - src2[ 0]);
2464   }
2465   if(svq3){
2466     H = ( 5*(H/4) ) / 16;
2467     V = ( 5*(V/4) ) / 16;
2468
2469     /* required for 100% accuracy */
2470     i = H; H = V; V = i;
2471   }else{
2472     H = ( 5*H+32 ) >> 6;
2473     V = ( 5*V+32 ) >> 6;
2474   }
2475
2476   a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
2477   for(j=16; j>0; --j) {
2478     int b = a;
2479     a += V;
2480     for(i=-16; i<0; i+=4) {
2481       src[16+i] = cm[ (b    ) >> 5 ];
2482       src[17+i] = cm[ (b+  H) >> 5 ];
2483       src[18+i] = cm[ (b+2*H) >> 5 ];
2484       src[19+i] = cm[ (b+3*H) >> 5 ];
2485       b += 4*H;
2486     }
2487     src += stride;
2488   }
2489 }
2490
2491 static void pred16x16_plane_c(uint8_t *src, int stride){
2492     pred16x16_plane_compat_c(src, stride, 0);
2493 }
2494
2495 static void pred8x8_vertical_c(uint8_t *src, int stride){
2496     int i;
2497     const uint32_t a= ((uint32_t*)(src-stride))[0];
2498     const uint32_t b= ((uint32_t*)(src-stride))[1];
2499
2500     for(i=0; i<8; i++){
2501         ((uint32_t*)(src+i*stride))[0]= a;
2502         ((uint32_t*)(src+i*stride))[1]= b;
2503     }
2504 }
2505
2506 static void pred8x8_horizontal_c(uint8_t *src, int stride){
2507     int i;
2508
2509     for(i=0; i<8; i++){
2510         ((uint32_t*)(src+i*stride))[0]=
2511         ((uint32_t*)(src+i*stride))[1]= src[-1+i*stride]*0x01010101;
2512     }
2513 }
2514
2515 static void pred8x8_128_dc_c(uint8_t *src, int stride){
2516     int i;
2517
2518     for(i=0; i<8; i++){
2519         ((uint32_t*)(src+i*stride))[0]=
2520         ((uint32_t*)(src+i*stride))[1]= 0x01010101U*128U;
2521     }
2522 }
2523
2524 static void pred8x8_left_dc_c(uint8_t *src, int stride){
2525     int i;
2526     int dc0, dc2;
2527
2528     dc0=dc2=0;
2529     for(i=0;i<4; i++){
2530         dc0+= src[-1+i*stride];
2531         dc2+= src[-1+(i+4)*stride];
2532     }
2533     dc0= 0x01010101*((dc0 + 2)>>2);
2534     dc2= 0x01010101*((dc2 + 2)>>2);
2535
2536     for(i=0; i<4; i++){
2537         ((uint32_t*)(src+i*stride))[0]=
2538         ((uint32_t*)(src+i*stride))[1]= dc0;
2539     }
2540     for(i=4; i<8; i++){
2541         ((uint32_t*)(src+i*stride))[0]=
2542         ((uint32_t*)(src+i*stride))[1]= dc2;
2543     }
2544 }
2545
2546 static void pred8x8_top_dc_c(uint8_t *src, int stride){
2547     int i;
2548     int dc0, dc1;
2549
2550     dc0=dc1=0;
2551     for(i=0;i<4; i++){
2552         dc0+= src[i-stride];
2553         dc1+= src[4+i-stride];
2554     }
2555     dc0= 0x01010101*((dc0 + 2)>>2);
2556     dc1= 0x01010101*((dc1 + 2)>>2);
2557
2558     for(i=0; i<4; i++){
2559         ((uint32_t*)(src+i*stride))[0]= dc0;
2560         ((uint32_t*)(src+i*stride))[1]= dc1;
2561     }
2562     for(i=4; i<8; i++){
2563         ((uint32_t*)(src+i*stride))[0]= dc0;
2564         ((uint32_t*)(src+i*stride))[1]= dc1;
2565     }
2566 }
2567
2568
2569 static void pred8x8_dc_c(uint8_t *src, int stride){
2570     int i;
2571     int dc0, dc1, dc2, dc3;
2572
2573     dc0=dc1=dc2=0;
2574     for(i=0;i<4; i++){
2575         dc0+= src[-1+i*stride] + src[i-stride];
2576         dc1+= src[4+i-stride];
2577         dc2+= src[-1+(i+4)*stride];
2578     }
2579     dc3= 0x01010101*((dc1 + dc2 + 4)>>3);
2580     dc0= 0x01010101*((dc0 + 4)>>3);
2581     dc1= 0x01010101*((dc1 + 2)>>2);
2582     dc2= 0x01010101*((dc2 + 2)>>2);
2583
2584     for(i=0; i<4; i++){
2585         ((uint32_t*)(src+i*stride))[0]= dc0;
2586         ((uint32_t*)(src+i*stride))[1]= dc1;
2587     }
2588     for(i=4; i<8; i++){
2589         ((uint32_t*)(src+i*stride))[0]= dc2;
2590         ((uint32_t*)(src+i*stride))[1]= dc3;
2591     }
2592 }
2593
2594 static void pred8x8_plane_c(uint8_t *src, int stride){
2595   int j, k;
2596   int a;
2597   uint8_t *cm = cropTbl + MAX_NEG_CROP;
2598   const uint8_t * const src0 = src+3-stride;
2599   const uint8_t *src1 = src+4*stride-1;
2600   const uint8_t *src2 = src1-2*stride;      // == src+2*stride-1;
2601   int H = src0[1] - src0[-1];
2602   int V = src1[0] - src2[ 0];
2603   for(k=2; k<=4; ++k) {
2604     src1 += stride; src2 -= stride;
2605     H += k*(src0[k] - src0[-k]);
2606     V += k*(src1[0] - src2[ 0]);
2607   }
2608   H = ( 17*H+16 ) >> 5;
2609   V = ( 17*V+16 ) >> 5;
2610
2611   a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
2612   for(j=8; j>0; --j) {
2613     int b = a;
2614     a += V;
2615     src[0] = cm[ (b    ) >> 5 ];
2616     src[1] = cm[ (b+  H) >> 5 ];
2617     src[2] = cm[ (b+2*H) >> 5 ];
2618     src[3] = cm[ (b+3*H) >> 5 ];
2619     src[4] = cm[ (b+4*H) >> 5 ];
2620     src[5] = cm[ (b+5*H) >> 5 ];
2621     src[6] = cm[ (b+6*H) >> 5 ];
2622     src[7] = cm[ (b+7*H) >> 5 ];
2623     src += stride;
2624   }
2625 }
2626
2627 #define SRC(x,y) src[(x)+(y)*stride]
2628 #define PL(y) \
2629     const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
2630 #define PREDICT_8x8_LOAD_LEFT \
2631     const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
2632                      + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
2633     PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
2634     const int l7 attribute_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2
2635
2636 #define PT(x) \
2637     const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
2638 #define PREDICT_8x8_LOAD_TOP \
2639     const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
2640                      + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
2641     PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
2642     const int t7 attribute_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
2643                      + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2
2644
2645 #define PTR(x) \
2646     t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
2647 #define PREDICT_8x8_LOAD_TOPRIGHT \
2648     int t8, t9, t10, t11, t12, t13, t14, t15; \
2649     if(has_topright) { \
2650         PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
2651         t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
2652     } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);
2653
2654 #define PREDICT_8x8_LOAD_TOPLEFT \
2655     const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2
2656
2657 #define PREDICT_8x8_DC(v) \
2658     int y; \
2659     for( y = 0; y < 8; y++ ) { \
2660         ((uint32_t*)src)[0] = \
2661         ((uint32_t*)src)[1] = v; \
2662         src += stride; \
2663     }
2664
2665 static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2666 {
2667     PREDICT_8x8_DC(0x80808080);
2668 }
2669 static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2670 {
2671     PREDICT_8x8_LOAD_LEFT;
2672     const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101;
2673     PREDICT_8x8_DC(dc);
2674 }
2675 static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2676 {
2677     PREDICT_8x8_LOAD_TOP;
2678     const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101;
2679     PREDICT_8x8_DC(dc);
2680 }
2681 static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2682 {
2683     PREDICT_8x8_LOAD_LEFT;
2684     PREDICT_8x8_LOAD_TOP;
2685     const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7
2686                          +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101;
2687     PREDICT_8x8_DC(dc);
2688 }
2689 static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2690 {
2691     PREDICT_8x8_LOAD_LEFT;
2692 #define ROW(y) ((uint32_t*)(src+y*stride))[0] =\
2693                ((uint32_t*)(src+y*stride))[1] = 0x01010101 * l##y
2694     ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
2695 #undef ROW
2696 }
2697 static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2698 {
2699     int y;
2700     PREDICT_8x8_LOAD_TOP;
2701     src[0] = t0;
2702     src[1] = t1;
2703     src[2] = t2;
2704     src[3] = t3;
2705     src[4] = t4;
2706     src[5] = t5;
2707     src[6] = t6;
2708     src[7] = t7;
2709     for( y = 1; y < 8; y++ )
2710         *(uint64_t*)(src+y*stride) = *(uint64_t*)src;
2711 }
2712 static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2713 {
2714     PREDICT_8x8_LOAD_TOP;
2715     PREDICT_8x8_LOAD_TOPRIGHT;
2716     SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
2717     SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
2718     SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
2719     SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
2720     SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
2721     SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
2722     SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
2723     SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
2724     SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
2725     SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
2726     SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
2727     SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
2728     SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
2729     SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
2730     SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
2731 }
2732 static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2733 {
2734     PREDICT_8x8_LOAD_TOP;
2735     PREDICT_8x8_LOAD_LEFT;
2736     PREDICT_8x8_LOAD_TOPLEFT;
2737     SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
2738     SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
2739     SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
2740     SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
2741     SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
2742     SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
2743     SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
2744     SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
2745     SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
2746     SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
2747     SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
2748     SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
2749     SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
2750     SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
2751     SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
2752
2753 }
2754 static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2755 {
2756     PREDICT_8x8_LOAD_TOP;
2757     PREDICT_8x8_LOAD_LEFT;
2758     PREDICT_8x8_LOAD_TOPLEFT;
2759     SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
2760     SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
2761     SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
2762     SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
2763     SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
2764     SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
2765     SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
2766     SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
2767     SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
2768     SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
2769     SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
2770     SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
2771     SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
2772     SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
2773     SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
2774     SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
2775     SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
2776     SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
2777     SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
2778     SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
2779     SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
2780     SRC(7,0)= (t6 + t7 + 1) >> 1;
2781 }
2782 static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2783 {
2784     PREDICT_8x8_LOAD_TOP;
2785     PREDICT_8x8_LOAD_LEFT;
2786     PREDICT_8x8_LOAD_TOPLEFT;
2787     SRC(0,7)= (l6 + l7 + 1) >> 1;
2788     SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
2789     SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
2790     SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
2791     SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
2792     SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
2793     SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
2794     SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
2795     SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
2796     SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
2797     SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
2798     SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
2799     SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
2800     SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
2801     SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
2802     SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
2803     SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
2804     SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
2805     SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
2806     SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
2807     SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
2808     SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
2809 }
2810 static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2811 {
2812     PREDICT_8x8_LOAD_TOP;
2813     PREDICT_8x8_LOAD_TOPRIGHT;
2814     SRC(0,0)= (t0 + t1 + 1) >> 1;
2815     SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
2816     SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
2817     SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
2818     SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
2819     SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
2820     SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
2821     SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
2822     SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
2823     SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
2824     SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
2825     SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
2826     SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
2827     SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
2828     SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
2829     SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
2830     SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
2831     SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
2832     SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
2833     SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
2834     SRC(7,6)= (t10 + t11 + 1) >> 1;
2835     SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
2836 }
2837 static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2838 {
2839     PREDICT_8x8_LOAD_LEFT;
2840     SRC(0,0)= (l0 + l1 + 1) >> 1;
2841     SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
2842     SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
2843     SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
2844     SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
2845     SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
2846     SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
2847     SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
2848     SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
2849     SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
2850     SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
2851     SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
2852     SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
2853     SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
2854     SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
2855     SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
2856     SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
2857     SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
2858 }
2859 #undef PREDICT_8x8_LOAD_LEFT
2860 #undef PREDICT_8x8_LOAD_TOP
2861 #undef PREDICT_8x8_LOAD_TOPLEFT
2862 #undef PREDICT_8x8_LOAD_TOPRIGHT
2863 #undef PREDICT_8x8_DC
2864 #undef PTR
2865 #undef PT
2866 #undef PL
2867 #undef SRC
2868
2869 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
2870                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2871                            int src_x_offset, int src_y_offset,
2872                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
2873     MpegEncContext * const s = &h->s;
2874     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
2875     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
2876     const int luma_xy= (mx&3) + ((my&3)<<2);
2877     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
2878     uint8_t * src_cb, * src_cr;
2879     int extra_width= h->emu_edge_width;
2880     int extra_height= h->emu_edge_height;
2881     int emu=0;
2882     const int full_mx= mx>>2;
2883     const int full_my= my>>2;
2884     const int pic_width  = 16*s->mb_width;
2885     const int pic_height = 16*s->mb_height >> MB_MBAFF;
2886
2887     if(!pic->data[0])
2888         return;
2889
2890     if(mx&7) extra_width -= 3;
2891     if(my&7) extra_height -= 3;
2892
2893     if(   full_mx < 0-extra_width
2894        || full_my < 0-extra_height
2895        || full_mx + 16/*FIXME*/ > pic_width + extra_width
2896        || full_my + 16/*FIXME*/ > pic_height + extra_height){
2897         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
2898             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
2899         emu=1;
2900     }
2901
2902     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
2903     if(!square){
2904         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
2905     }
2906
2907     if(s->flags&CODEC_FLAG_GRAY) return;
2908
2909     if(MB_MBAFF){
2910         // chroma offset when predicting from a field of opposite parity
2911         my += 2 * ((s->mb_y & 1) - (h->ref_cache[list][scan8[n]] & 1));
2912         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
2913     }
2914     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
2915     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
2916
2917     if(emu){
2918         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
2919             src_cb= s->edge_emu_buffer;
2920     }
2921     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
2922
2923     if(emu){
2924         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
2925             src_cr= s->edge_emu_buffer;
2926     }
2927     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
2928 }
2929
2930 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
2931                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2932                            int x_offset, int y_offset,
2933                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2934                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
2935                            int list0, int list1){
2936     MpegEncContext * const s = &h->s;
2937     qpel_mc_func *qpix_op=  qpix_put;
2938     h264_chroma_mc_func chroma_op= chroma_put;
2939
2940     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
2941     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
2942     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
2943     x_offset += 8*s->mb_x;
2944     y_offset += 8*(s->mb_y >> MB_MBAFF);
2945
2946     if(list0){
2947         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
2948         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
2949                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
2950                            qpix_op, chroma_op);
2951
2952         qpix_op=  qpix_avg;
2953         chroma_op= chroma_avg;
2954     }
2955
2956     if(list1){
2957         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
2958         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
2959                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
2960                            qpix_op, chroma_op);
2961     }
2962 }
2963
2964 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
2965                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2966                            int x_offset, int y_offset,
2967                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2968                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
2969                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
2970                            int list0, int list1){
2971     MpegEncContext * const s = &h->s;
2972
2973     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
2974     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
2975     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
2976     x_offset += 8*s->mb_x;
2977     y_offset += 8*(s->mb_y >> MB_MBAFF);
2978
2979     if(list0 && list1){
2980         /* don't optimize for luma-only case, since B-frames usually
2981          * use implicit weights => chroma too. */
2982         uint8_t *tmp_cb = s->obmc_scratchpad;
2983         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
2984         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
2985         int refn0 = h->ref_cache[0][ scan8[n] ];
2986         int refn1 = h->ref_cache[1][ scan8[n] ];
2987
2988         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
2989                     dest_y, dest_cb, dest_cr,
2990                     x_offset, y_offset, qpix_put, chroma_put);
2991         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
2992                     tmp_y, tmp_cb, tmp_cr,
2993                     x_offset, y_offset, qpix_put, chroma_put);
2994
2995         if(h->use_weight == 2){
2996             int weight0 = h->implicit_weight[refn0][refn1];
2997             int weight1 = 64 - weight0;
2998             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
2999             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
3000             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
3001         }else{
3002             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
3003                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
3004                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
3005             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
3006                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
3007                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
3008             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
3009                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
3010                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
3011         }
3012     }else{
3013         int list = list1 ? 1 : 0;
3014         int refn = h->ref_cache[list][ scan8[n] ];
3015         Picture *ref= &h->ref_list[list][refn];
3016         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
3017                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
3018                     qpix_put, chroma_put);
3019
3020         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
3021                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
3022         if(h->use_weight_chroma){
3023             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
3024                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
3025             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
3026                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
3027         }
3028     }
3029 }
3030
3031 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
3032                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
3033                            int x_offset, int y_offset,
3034                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
3035                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
3036                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
3037                            int list0, int list1){
3038     if((h->use_weight==2 && list0 && list1
3039         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
3040        || h->use_weight==1)
3041         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
3042                          x_offset, y_offset, qpix_put, chroma_put,
3043                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
3044     else
3045         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
3046                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
3047 }
3048
3049 static inline void prefetch_motion(H264Context *h, int list){
3050     /* fetch pixels for estimated mv 4 macroblocks ahead
3051      * optimized for 64byte cache lines */
3052     MpegEncContext * const s = &h->s;
3053     const int refn = h->ref_cache[list][scan8[0]];
3054     if(refn >= 0){
3055         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
3056         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
3057         uint8_t **src= h->ref_list[list][refn].data;
3058         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
3059         s->dsp.prefetch(src[0]+off, s->linesize, 4);
3060         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
3061         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
3062     }
3063 }
3064
3065 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
3066                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
3067                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
3068                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
3069     MpegEncContext * const s = &h->s;
3070     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
3071     const int mb_type= s->current_picture.mb_type[mb_xy];
3072
3073     assert(IS_INTER(mb_type));
3074
3075     prefetch_motion(h, 0);
3076
3077     if(IS_16X16(mb_type)){
3078         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
3079                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
3080                 &weight_op[0], &weight_avg[0],
3081                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
3082     }else if(IS_16X8(mb_type)){
3083         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
3084                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
3085                 &weight_op[1], &weight_avg[1],
3086                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
3087         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
3088                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
3089                 &weight_op[1], &weight_avg[1],
3090                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
3091     }else if(IS_8X16(mb_type)){
3092         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
3093                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
3094                 &weight_op[2], &weight_avg[2],
3095                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
3096         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
3097                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
3098                 &weight_op[2], &weight_avg[2],
3099                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
3100     }else{
3101         int i;
3102
3103         assert(IS_8X8(mb_type));
3104
3105         for(i=0; i<4; i++){
3106             const int sub_mb_type= h->sub_mb_type[i];
3107             const int n= 4*i;
3108             int x_offset= (i&1)<<2;
3109             int y_offset= (i&2)<<1;
3110
3111             if(IS_SUB_8X8(sub_mb_type)){
3112                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
3113                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
3114                     &weight_op[3], &weight_avg[3],
3115                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3116             }else if(IS_SUB_8X4(sub_mb_type)){
3117                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
3118                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
3119                     &weight_op[4], &weight_avg[4],
3120                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3121                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
3122                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
3123                     &weight_op[4], &weight_avg[4],
3124                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3125             }else if(IS_SUB_4X8(sub_mb_type)){
3126                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
3127                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
3128                     &weight_op[5], &weight_avg[5],
3129                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3130                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
3131                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
3132                     &weight_op[5], &weight_avg[5],
3133                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3134             }else{
3135                 int j;
3136                 assert(IS_SUB_4X4(sub_mb_type));
3137                 for(j=0; j<4; j++){
3138                     int sub_x_offset= x_offset + 2*(j&1);
3139                     int sub_y_offset= y_offset +   (j&2);
3140                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
3141                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
3142                         &weight_op[6], &weight_avg[6],
3143                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3144                 }
3145             }
3146         }
3147     }
3148
3149     prefetch_motion(h, 1);
3150 }
3151
3152 static void decode_init_vlc(H264Context *h){
3153     static int done = 0;
3154
3155     if (!done) {
3156         int i;
3157         done = 1;
3158
3159         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
3160                  &chroma_dc_coeff_token_len [0], 1, 1,
3161                  &chroma_dc_coeff_token_bits[0], 1, 1, 1);
3162
3163         for(i=0; i<4; i++){
3164             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
3165                      &coeff_token_len [i][0], 1, 1,
3166                      &coeff_token_bits[i][0], 1, 1, 1);
3167         }
3168
3169         for(i=0; i<3; i++){
3170             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
3171                      &chroma_dc_total_zeros_len [i][0], 1, 1,
3172                      &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
3173         }
3174         for(i=0; i<15; i++){
3175             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
3176                      &total_zeros_len [i][0], 1, 1,
3177                      &total_zeros_bits[i][0], 1, 1, 1);
3178         }
3179
3180         for(i=0; i<6; i++){
3181             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
3182                      &run_len [i][0], 1, 1,
3183                      &run_bits[i][0], 1, 1, 1);
3184         }
3185         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
3186                  &run_len [6][0], 1, 1,
3187                  &run_bits[6][0], 1, 1, 1);
3188     }
3189 }
3190
3191 /**
3192  * Sets the intra prediction function pointers.
3193  */
3194 static void init_pred_ptrs(H264Context *h){
3195 //    MpegEncContext * const s = &h->s;
3196
3197     h->pred4x4[VERT_PRED           ]= pred4x4_vertical_c;
3198     h->pred4x4[HOR_PRED            ]= pred4x4_horizontal_c;
3199     h->pred4x4[DC_PRED             ]= pred4x4_dc_c;
3200     h->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_c;
3201     h->pred4x4[DIAG_DOWN_RIGHT_PRED]= pred4x4_down_right_c;
3202     h->pred4x4[VERT_RIGHT_PRED     ]= pred4x4_vertical_right_c;
3203     h->pred4x4[HOR_DOWN_PRED       ]= pred4x4_horizontal_down_c;
3204     h->pred4x4[VERT_LEFT_PRED      ]= pred4x4_vertical_left_c;
3205     h->pred4x4[HOR_UP_PRED         ]= pred4x4_horizontal_up_c;
3206     h->pred4x4[LEFT_DC_PRED        ]= pred4x4_left_dc_c;
3207     h->pred4x4[TOP_DC_PRED         ]= pred4x4_top_dc_c;
3208     h->pred4x4[DC_128_PRED         ]= pred4x4_128_dc_c;
3209
3210     h->pred8x8l[VERT_PRED           ]= pred8x8l_vertical_c;
3211     h->pred8x8l[HOR_PRED            ]= pred8x8l_horizontal_c;
3212     h->pred8x8l[DC_PRED             ]= pred8x8l_dc_c;
3213     h->pred8x8l[DIAG_DOWN_LEFT_PRED ]= pred8x8l_down_left_c;
3214     h->pred8x8l[DIAG_DOWN_RIGHT_PRED]= pred8x8l_down_right_c;
3215     h->pred8x8l[VERT_RIGHT_PRED     ]= pred8x8l_vertical_right_c;
3216     h->pred8x8l[HOR_DOWN_PRED       ]= pred8x8l_horizontal_down_c;
3217     h->pred8x8l[VERT_LEFT_PRED      ]= pred8x8l_vertical_left_c;
3218     h->pred8x8l[HOR_UP_PRED         ]= pred8x8l_horizontal_up_c;
3219     h->pred8x8l[LEFT_DC_PRED        ]= pred8x8l_left_dc_c;
3220     h->pred8x8l[TOP_DC_PRED         ]= pred8x8l_top_dc_c;
3221     h->pred8x8l[DC_128_PRED         ]= pred8x8l_128_dc_c;
3222
3223     h->pred8x8[DC_PRED8x8     ]= pred8x8_dc_c;
3224     h->pred8x8[VERT_PRED8x8   ]= pred8x8_vertical_c;
3225     h->pred8x8[HOR_PRED8x8    ]= pred8x8_horizontal_c;
3226     h->pred8x8[PLANE_PRED8x8  ]= pred8x8_plane_c;
3227     h->pred8x8[LEFT_DC_PRED8x8]= pred8x8_left_dc_c;
3228     h->pred8x8[TOP_DC_PRED8x8 ]= pred8x8_top_dc_c;
3229     h->pred8x8[DC_128_PRED8x8 ]= pred8x8_128_dc_c;
3230
3231     h->pred16x16[DC_PRED8x8     ]= pred16x16_dc_c;
3232     h->pred16x16[VERT_PRED8x8   ]= pred16x16_vertical_c;
3233     h->pred16x16[HOR_PRED8x8    ]= pred16x16_horizontal_c;
3234     h->pred16x16[PLANE_PRED8x8  ]= pred16x16_plane_c;
3235     h->pred16x16[LEFT_DC_PRED8x8]= pred16x16_left_dc_c;
3236     h->pred16x16[TOP_DC_PRED8x8 ]= pred16x16_top_dc_c;
3237     h->pred16x16[DC_128_PRED8x8 ]= pred16x16_128_dc_c;
3238 }
3239
3240 static void free_tables(H264Context *h){
3241     av_freep(&h->intra4x4_pred_mode);
3242     av_freep(&h->chroma_pred_mode_table);
3243     av_freep(&h->cbp_table);
3244     av_freep(&h->mvd_table[0]);
3245     av_freep(&h->mvd_table[1]);
3246     av_freep(&h->direct_table);
3247     av_freep(&h->non_zero_count);
3248     av_freep(&h->slice_table_base);
3249     av_freep(&h->top_borders[1]);
3250     av_freep(&h->top_borders[0]);
3251     h->slice_table= NULL;
3252
3253     av_freep(&h->mb2b_xy);
3254     av_freep(&h->mb2b8_xy);
3255
3256     av_freep(&h->s.obmc_scratchpad);
3257 }
3258
3259 static void init_dequant8_coeff_table(H264Context *h){
3260     int i,q,x;
3261     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
3262     h->dequant8_coeff[0] = h->dequant8_buffer[0];
3263     h->dequant8_coeff[1] = h->dequant8_buffer[1];
3264
3265     for(i=0; i<2; i++ ){
3266         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
3267             h->dequant8_coeff[1] = h->dequant8_buffer[0];
3268             break;
3269         }
3270
3271         for(q=0; q<52; q++){
3272             int shift = div6[q];
3273             int idx = rem6[q];
3274             for(x=0; x<64; x++)
3275                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
3276                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
3277                     h->pps.scaling_matrix8[i][x]) << shift;
3278         }
3279     }
3280 }
3281
3282 static void init_dequant4_coeff_table(H264Context *h){
3283     int i,j,q,x;
3284     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
3285     for(i=0; i<6; i++ ){
3286         h->dequant4_coeff[i] = h->dequant4_buffer[i];
3287         for(j=0; j<i; j++){
3288             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
3289                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
3290                 break;
3291             }
3292         }
3293         if(j<i)
3294             continue;
3295
3296         for(q=0; q<52; q++){
3297             int shift = div6[q] + 2;
3298             int idx = rem6[q];
3299             for(x=0; x<16; x++)
3300                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
3301                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
3302                     h->pps.scaling_matrix4[i][x]) << shift;
3303         }
3304     }
3305 }
3306
3307 static void init_dequant_tables(H264Context *h){
3308     int i,x;
3309     init_dequant4_coeff_table(h);
3310     if(h->pps.transform_8x8_mode)
3311         init_dequant8_coeff_table(h);
3312     if(h->sps.transform_bypass){
3313         for(i=0; i<6; i++)
3314             for(x=0; x<16; x++)
3315                 h->dequant4_coeff[i][0][x] = 1<<6;
3316         if(h->pps.transform_8x8_mode)
3317             for(i=0; i<2; i++)
3318                 for(x=0; x<64; x++)
3319                     h->dequant8_coeff[i][0][x] = 1<<6;
3320     }
3321 }
3322
3323
3324 /**
3325  * allocates tables.
3326  * needs width/height
3327  */
3328 static int alloc_tables(H264Context *h){
3329     MpegEncContext * const s = &h->s;
3330     const int big_mb_num= s->mb_stride * (s->mb_height+1);
3331     int x,y;
3332
3333     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
3334
3335     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
3336     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
3337     CHECKED_ALLOCZ(h->top_borders[0]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
3338     CHECKED_ALLOCZ(h->top_borders[1]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
3339     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
3340
3341     if( h->pps.cabac ) {
3342         CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
3343         CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
3344         CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
3345         CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
3346     }
3347
3348     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(uint8_t));
3349     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
3350
3351     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
3352     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
3353     for(y=0; y<s->mb_height; y++){
3354         for(x=0; x<s->mb_width; x++){
3355             const int mb_xy= x + y*s->mb_stride;
3356             const int b_xy = 4*x + 4*y*h->b_stride;
3357             const int b8_xy= 2*x + 2*y*h->b8_stride;
3358
3359             h->mb2b_xy [mb_xy]= b_xy;
3360             h->mb2b8_xy[mb_xy]= b8_xy;
3361         }
3362     }
3363
3364     s->obmc_scratchpad = NULL;
3365
3366     if(!h->dequant4_coeff[0])
3367         init_dequant_tables(h);
3368
3369     return 0;
3370 fail:
3371     free_tables(h);
3372     return -1;
3373 }
3374
3375 static void common_init(H264Context *h){
3376     MpegEncContext * const s = &h->s;
3377
3378     s->width = s->avctx->width;
3379     s->height = s->avctx->height;
3380     s->codec_id= s->avctx->codec->id;
3381
3382     init_pred_ptrs(h);
3383
3384     h->dequant_coeff_pps= -1;
3385     s->unrestricted_mv=1;
3386     s->decode=1; //FIXME
3387
3388     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
3389     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
3390 }
3391
3392 static int decode_init(AVCodecContext *avctx){
3393     H264Context *h= avctx->priv_data;
3394     MpegEncContext * const s = &h->s;
3395
3396     MPV_decode_defaults(s);
3397
3398     s->avctx = avctx;
3399     common_init(h);
3400
3401     s->out_format = FMT_H264;
3402     s->workaround_bugs= avctx->workaround_bugs;
3403
3404     // set defaults
3405 //    s->decode_mb= ff_h263_decode_mb;
3406     s->low_delay= 1;
3407     avctx->pix_fmt= PIX_FMT_YUV420P;
3408
3409     decode_init_vlc(h);
3410
3411     if(avctx->extradata_size > 0 && avctx->extradata &&
3412        *(char *)avctx->extradata == 1){
3413         h->is_avc = 1;
3414         h->got_avcC = 0;
3415     } else {
3416         h->is_avc = 0;
3417     }
3418
3419     return 0;
3420 }
3421
3422 static int frame_start(H264Context *h){
3423     MpegEncContext * const s = &h->s;
3424     int i;
3425
3426     if(MPV_frame_start(s, s->avctx) < 0)
3427         return -1;
3428     ff_er_frame_start(s);
3429
3430     assert(s->linesize && s->uvlinesize);
3431
3432     for(i=0; i<16; i++){
3433         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
3434         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
3435     }
3436     for(i=0; i<4; i++){
3437         h->block_offset[16+i]=
3438         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
3439         h->block_offset[24+16+i]=
3440         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
3441     }
3442
3443     /* can't be in alloc_tables because linesize isn't known there.
3444      * FIXME: redo bipred weight to not require extra buffer? */
3445     if(!s->obmc_scratchpad)
3446         s->obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
3447
3448     /* some macroblocks will be accessed before they're available */
3449     if(FRAME_MBAFF)
3450         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
3451
3452 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
3453     return 0;
3454 }
3455
3456 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
3457     MpegEncContext * const s = &h->s;
3458     int i;
3459
3460     src_y  -=   linesize;
3461     src_cb -= uvlinesize;
3462     src_cr -= uvlinesize;
3463
3464     // There are two lines saved, the line above the the top macroblock of a pair,
3465     // and the line above the bottom macroblock
3466     h->left_border[0]= h->top_borders[0][s->mb_x][15];
3467     for(i=1; i<17; i++){
3468         h->left_border[i]= src_y[15+i*  linesize];
3469     }
3470
3471     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
3472     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
3473
3474     if(!(s->flags&CODEC_FLAG_GRAY)){
3475         h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
3476         h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
3477         for(i=1; i<9; i++){
3478             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
3479             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
3480         }
3481         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
3482         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
3483     }
3484 }
3485
3486 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
3487     MpegEncContext * const s = &h->s;
3488     int temp8, i;
3489     uint64_t temp64;
3490     int deblock_left = (s->mb_x > 0);
3491     int deblock_top  = (s->mb_y > 0);
3492
3493     src_y  -=   linesize + 1;
3494     src_cb -= uvlinesize + 1;
3495     src_cr -= uvlinesize + 1;
3496
3497 #define XCHG(a,b,t,xchg)\
3498 t= a;\
3499 if(xchg)\
3500     a= b;\
3501 b= t;
3502
3503     if(deblock_left){
3504         for(i = !deblock_top; i<17; i++){
3505             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
3506         }
3507     }
3508
3509     if(deblock_top){
3510         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
3511         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
3512         if(s->mb_x+1 < s->mb_width){
3513             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
3514         }
3515     }
3516
3517     if(!(s->flags&CODEC_FLAG_GRAY)){
3518         if(deblock_left){
3519             for(i = !deblock_top; i<9; i++){
3520                 XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
3521                 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
3522             }
3523         }
3524         if(deblock_top){
3525             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
3526             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
3527         }
3528     }
3529 }
3530
3531 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
3532     MpegEncContext * const s = &h->s;
3533     int i;
3534
3535     src_y  -= 2 *   linesize;
3536     src_cb -= 2 * uvlinesize;
3537     src_cr -= 2 * uvlinesize;
3538
3539     // There are two lines saved, the line above the the top macroblock of a pair,
3540     // and the line above the bottom macroblock
3541     h->left_border[0]= h->top_borders[0][s->mb_x][15];
3542     h->left_border[1]= h->top_borders[1][s->mb_x][15];
3543     for(i=2; i<34; i++){
3544         h->left_border[i]= src_y[15+i*  linesize];
3545     }
3546
3547     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
3548     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
3549     *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
3550     *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
3551
3552     if(!(s->flags&CODEC_FLAG_GRAY)){
3553         h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
3554         h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
3555         h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
3556         h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
3557         for(i=2; i<18; i++){
3558             h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
3559             h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
3560         }
3561         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
3562         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
3563         *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
3564         *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
3565     }
3566 }
3567
3568 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
3569     MpegEncContext * const s = &h->s;
3570     int temp8, i;
3571     uint64_t temp64;
3572     int deblock_left = (s->mb_x > 0);
3573     int deblock_top  = (s->mb_y > 1);
3574
3575     tprintf("xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
3576
3577     src_y  -= 2 *   linesize + 1;
3578     src_cb -= 2 * uvlinesize + 1;
3579     src_cr -= 2 * uvlinesize + 1;
3580
3581 #define XCHG(a,b,t,xchg)\
3582 t= a;\
3583 if(xchg)\
3584     a= b;\
3585 b= t;
3586
3587     if(deblock_left){
3588         for(i = (!deblock_top)<<1; i<34; i++){
3589             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
3590         }
3591     }
3592
3593     if(deblock_top){
3594         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
3595         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
3596         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
3597         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
3598         if(s->mb_x+1 < s->mb_width){
3599             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
3600             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
3601         }
3602     }
3603
3604     if(!(s->flags&CODEC_FLAG_GRAY)){
3605         if(deblock_left){
3606             for(i = (!deblock_top) << 1; i<18; i++){
3607                 XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
3608                 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
3609             }
3610         }
3611         if(deblock_top){
3612             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
3613             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
3614             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
3615             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
3616         }
3617     }
3618 }
3619
3620 static void hl_decode_mb(H264Context *h){
3621     MpegEncContext * const s = &h->s;
3622     const int mb_x= s->mb_x;
3623     const int mb_y= s->mb_y;
3624     const int mb_xy= mb_x + mb_y*s->mb_stride;
3625     const int mb_type= s->current_picture.mb_type[mb_xy];
3626     uint8_t  *dest_y, *dest_cb, *dest_cr;
3627     int linesize, uvlinesize /*dct_offset*/;
3628     int i;
3629     int *block_offset = &h->block_offset[0];
3630     const unsigned int bottom = mb_y & 1;
3631     const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass);
3632     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
3633     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
3634
3635     if(!s->decode)
3636         return;
3637
3638     dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
3639     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3640     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3641
3642     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
3643     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
3644
3645     if (MB_FIELD) {
3646         linesize   = h->mb_linesize   = s->linesize * 2;
3647         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
3648         block_offset = &h->block_offset[24];
3649         if(mb_y&1){ //FIXME move out of this func?
3650             dest_y -= s->linesize*15;
3651             dest_cb-= s->uvlinesize*7;
3652             dest_cr-= s->uvlinesize*7;
3653         }
3654         if(FRAME_MBAFF) {
3655             int list;
3656             for(list=0; list<2; list++){
3657                 if(!USES_LIST(mb_type, list))
3658                     continue;
3659                 if(IS_16X16(mb_type)){
3660                     int8_t *ref = &h->ref_cache[list][scan8[0]];
3661                     fill_rectangle(ref, 4, 4, 8, 16+*ref^(s->mb_y&1), 1);
3662                 }else{
3663                     for(i=0; i<16; i+=4){
3664                         //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
3665                         int ref = h->ref_cache[list][scan8[i]];
3666                         if(ref >= 0)
3667                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, 16+ref^(s->mb_y&1), 1);
3668                     }
3669                 }
3670             }
3671         }
3672     } else {
3673         linesize   = h->mb_linesize   = s->linesize;
3674         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
3675 //        dct_offset = s->linesize * 16;
3676     }
3677
3678     if(transform_bypass){
3679         idct_dc_add =
3680         idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
3681     }else if(IS_8x8DCT(mb_type)){
3682         idct_dc_add = s->dsp.h264_idct8_dc_add;
3683         idct_add = s->dsp.h264_idct8_add;
3684     }else{
3685         idct_dc_add = s->dsp.h264_idct_dc_add;
3686         idct_add = s->dsp.h264_idct_add;
3687     }
3688
3689     if(FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
3690        && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
3691         int mbt_y = mb_y&~1;
3692         uint8_t *top_y  = s->current_picture.data[0] + (mbt_y * 16* s->linesize  ) + mb_x * 16;
3693         uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
3694         uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
3695         xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
3696     }
3697
3698     if (IS_INTRA_PCM(mb_type)) {
3699         unsigned int x, y;
3700
3701         // The pixels are stored in h->mb array in the same order as levels,
3702         // copy them in output in the correct order.
3703         for(i=0; i<16; i++) {
3704             for (y=0; y<4; y++) {
3705                 for (x=0; x<4; x++) {
3706                     *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
3707                 }
3708             }
3709         }
3710         for(i=16; i<16+4; i++) {
3711             for (y=0; y<4; y++) {
3712                 for (x=0; x<4; x++) {
3713                     *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
3714                 }
3715             }
3716         }
3717         for(i=20; i<20+4; i++) {
3718             for (y=0; y<4; y++) {
3719                 for (x=0; x<4; x++) {
3720                     *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
3721                 }
3722             }
3723         }
3724     } else {
3725         if(IS_INTRA(mb_type)){
3726             if(h->deblocking_filter && !FRAME_MBAFF)
3727                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1);
3728
3729             if(!(s->flags&CODEC_FLAG_GRAY)){
3730                 h->pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
3731                 h->pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
3732             }
3733
3734             if(IS_INTRA4x4(mb_type)){
3735                 if(!s->encoding){
3736                     if(IS_8x8DCT(mb_type)){
3737                         for(i=0; i<16; i+=4){
3738                             uint8_t * const ptr= dest_y + block_offset[i];
3739                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
3740                             const int nnz = h->non_zero_count_cache[ scan8[i] ];
3741                             h->pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
3742                                                    (h->topright_samples_available<<(i+1))&0x8000, linesize);
3743                             if(nnz){
3744                                 if(nnz == 1 && h->mb[i*16])
3745                                     idct_dc_add(ptr, h->mb + i*16, linesize);
3746                                 else
3747                                     idct_add(ptr, h->mb + i*16, linesize);
3748                             }
3749                         }
3750                     }else
3751                     for(i=0; i<16; i++){
3752                         uint8_t * const ptr= dest_y + block_offset[i];
3753                         uint8_t *topright;
3754                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
3755                         int nnz, tr;
3756
3757                         if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
3758                             const int topright_avail= (h->topright_samples_available<<i)&0x8000;
3759                             assert(mb_y || linesize <= block_offset[i]);
3760                             if(!topright_avail){
3761                                 tr= ptr[3 - linesize]*0x01010101;
3762                                 topright= (uint8_t*) &tr;
3763                             }else
3764                                 topright= ptr + 4 - linesize;
3765                         }else
3766                             topright= NULL;
3767
3768                         h->pred4x4[ dir ](ptr, topright, linesize);
3769                         nnz = h->non_zero_count_cache[ scan8[i] ];
3770                         if(nnz){
3771                             if(s->codec_id == CODEC_ID_H264){
3772                                 if(nnz == 1 && h->mb[i*16])
3773                                     idct_dc_add(ptr, h->mb + i*16, linesize);
3774                                 else
3775                                     idct_add(ptr, h->mb + i*16, linesize);
3776                             }else
3777                                 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
3778                         }
3779                     }
3780                 }
3781             }else{
3782                 h->pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
3783                 if(s->codec_id == CODEC_ID_H264){
3784                     if(!transform_bypass)
3785                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[IS_INTRA(mb_type) ? 0:3][s->qscale][0]);
3786                 }else
3787                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
3788             }
3789             if(h->deblocking_filter && !FRAME_MBAFF)
3790                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0);
3791         }else if(s->codec_id == CODEC_ID_H264){
3792             hl_motion(h, dest_y, dest_cb, dest_cr,
3793                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
3794                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
3795                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
3796         }
3797
3798
3799         if(!IS_INTRA4x4(mb_type)){
3800             if(s->codec_id == CODEC_ID_H264){
3801                 if(IS_INTRA16x16(mb_type)){
3802                     for(i=0; i<16; i++){
3803                         if(h->non_zero_count_cache[ scan8[i] ])
3804                             idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3805                         else if(h->mb[i*16])
3806                             idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3807                     }
3808                 }else{
3809                     const int di = IS_8x8DCT(mb_type) ? 4 : 1;
3810                     for(i=0; i<16; i+=di){
3811                         int nnz = h->non_zero_count_cache[ scan8[i] ];
3812                         if(nnz){
3813                             if(nnz==1 && h->mb[i*16])
3814                                 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3815                             else
3816                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3817                         }
3818                     }
3819                 }
3820             }else{
3821                 for(i=0; i<16; i++){
3822                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
3823                         uint8_t * const ptr= dest_y + block_offset[i];
3824                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
3825                     }
3826                 }
3827             }
3828         }
3829
3830         if(!(s->flags&CODEC_FLAG_GRAY)){
3831             uint8_t *dest[2] = {dest_cb, dest_cr};
3832             if(transform_bypass){
3833                 idct_add = idct_dc_add = s->dsp.add_pixels4;
3834             }else{
3835                 idct_add = s->dsp.h264_idct_add;
3836                 idct_dc_add = s->dsp.h264_idct_dc_add;
3837                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp][0]);
3838                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp][0]);
3839             }
3840             if(s->codec_id == CODEC_ID_H264){
3841                 for(i=16; i<16+8; i++){
3842                     if(h->non_zero_count_cache[ scan8[i] ])
3843                         idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
3844                     else if(h->mb[i*16])
3845                         idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
3846                 }
3847             }else{
3848                 for(i=16; i<16+8; i++){
3849                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
3850                         uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
3851                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
3852                     }
3853                 }
3854             }
3855         }
3856     }
3857     if(h->deblocking_filter) {
3858         if (FRAME_MBAFF) {
3859             //FIXME try deblocking one mb at a time?
3860             // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
3861             const int mb_y = s->mb_y - 1;
3862             uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
3863             const int mb_xy= mb_x + mb_y*s->mb_stride;
3864             const int mb_type_top   = s->current_picture.mb_type[mb_xy];
3865             const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
3866             if (!bottom) return;
3867             pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
3868             pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3869             pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3870
3871             if(IS_INTRA(mb_type_top | mb_type_bottom))
3872                 xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
3873
3874             backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
3875             // deblock a pair
3876             // top
3877             s->mb_y--;
3878             tprintf("call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
3879             fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
3880             h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy]);
3881             filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
3882             // bottom
3883             s->mb_y++;
3884             tprintf("call mbaff filter_mb\n");
3885             fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
3886             h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
3887             filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3888         } else {
3889             tprintf("call filter_mb\n");
3890             backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3891             fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
3892             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3893         }
3894     }
3895 }
3896
3897 /**
3898  * fills the default_ref_list.
3899  */
3900 static int fill_default_ref_list(H264Context *h){
3901     MpegEncContext * const s = &h->s;
3902     int i;
3903     int smallest_poc_greater_than_current = -1;
3904     Picture sorted_short_ref[32];
3905
3906     if(h->slice_type==B_TYPE){
3907         int out_i;
3908         int limit= INT_MIN;
3909
3910         /* sort frame according to poc in B slice */
3911         for(out_i=0; out_i<h->short_ref_count; out_i++){
3912             int best_i=INT_MIN;
3913             int best_poc=INT_MAX;
3914
3915             for(i=0; i<h->short_ref_count; i++){
3916                 const int poc= h->short_ref[i]->poc;
3917                 if(poc > limit && poc < best_poc){
3918                     best_poc= poc;
3919                     best_i= i;
3920                 }
3921             }
3922
3923             assert(best_i != INT_MIN);
3924
3925             limit= best_poc;
3926             sorted_short_ref[out_i]= *h->short_ref[best_i];
3927             tprintf("sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
3928             if (-1 == smallest_poc_greater_than_current) {
3929                 if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
3930                     smallest_poc_greater_than_current = out_i;
3931                 }
3932             }
3933         }
3934     }
3935
3936     if(s->picture_structure == PICT_FRAME){
3937         if(h->slice_type==B_TYPE){
3938             int list;
3939             tprintf("current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
3940
3941             // find the largest poc
3942             for(list=0; list<2; list++){
3943                 int index = 0;
3944                 int j= -99;
3945                 int step= list ? -1 : 1;
3946
3947                 for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
3948                     while(j<0 || j>= h->short_ref_count){
3949                         if(j != -99 && step == (list ? -1 : 1))
3950                             return -1;
3951                         step = -step;
3952                         j= smallest_poc_greater_than_current + (step>>1);
3953                     }
3954                     if(sorted_short_ref[j].reference != 3) continue;
3955                     h->default_ref_list[list][index  ]= sorted_short_ref[j];
3956                     h->default_ref_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
3957                 }
3958
3959                 for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
3960                     if(h->long_ref[i] == NULL) continue;
3961                     if(h->long_ref[i]->reference != 3) continue;
3962
3963                     h->default_ref_list[ list ][index  ]= *h->long_ref[i];
3964                     h->default_ref_list[ list ][index++].pic_id= i;;
3965                 }
3966
3967                 if(list && (smallest_poc_greater_than_current<=0 || smallest_poc_greater_than_current>=h->short_ref_count) && (1 < index)){
3968                     // swap the two first elements of L1 when
3969                     // L0 and L1 are identical
3970                     Picture temp= h->default_ref_list[1][0];
3971                     h->default_ref_list[1][0] = h->default_ref_list[1][1];
3972                     h->default_ref_list[1][1] = temp;
3973                 }
3974
3975                 if(index < h->ref_count[ list ])
3976                     memset(&h->default_ref_list[list][index], 0, sizeof(Picture)*(h->ref_count[ list ] - index));
3977             }
3978         }else{
3979             int index=0;
3980             for(i=0; i<h->short_ref_count; i++){
3981                 if(h->short_ref[i]->reference != 3) continue; //FIXME refernce field shit
3982                 h->default_ref_list[0][index  ]= *h->short_ref[i];
3983                 h->default_ref_list[0][index++].pic_id= h->short_ref[i]->frame_num;
3984             }
3985             for(i = 0; i < 16; i++){
3986                 if(h->long_ref[i] == NULL) continue;
3987                 if(h->long_ref[i]->reference != 3) continue;
3988                 h->default_ref_list[0][index  ]= *h->long_ref[i];
3989                 h->default_ref_list[0][index++].pic_id= i;;
3990             }
3991             if(index < h->ref_count[0])
3992                 memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
3993         }
3994     }else{ //FIELD
3995         if(h->slice_type==B_TYPE){
3996         }else{
3997             //FIXME second field balh
3998         }
3999     }
4000 #ifdef TRACE
4001     for (i=0; i<h->ref_count[0]; i++) {
4002         tprintf("List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
4003     }
4004     if(h->slice_type==B_TYPE){
4005         for (i=0; i<h->ref_count[1]; i++) {
4006             tprintf("List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[0][i].data[0]);
4007         }
4008     }
4009 #endif
4010     return 0;
4011 }
4012
4013 static void print_short_term(H264Context *h);
4014 static void print_long_term(H264Context *h);
4015
4016 static int decode_ref_pic_list_reordering(H264Context *h){
4017     MpegEncContext * const s = &h->s;
4018     int list, index;
4019
4020     print_short_term(h);
4021     print_long_term(h);
4022     if(h->slice_type==I_TYPE || h->slice_type==SI_TYPE) return 0; //FIXME move before func
4023
4024     for(list=0; list<2; list++){
4025         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
4026
4027         if(get_bits1(&s->gb)){
4028             int pred= h->curr_pic_num;
4029
4030             for(index=0; ; index++){
4031                 int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
4032                 int pic_id;
4033                 int i;
4034                 Picture *ref = NULL;
4035
4036                 if(reordering_of_pic_nums_idc==3)
4037                     break;
4038
4039                 if(index >= h->ref_count[list]){
4040                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
4041                     return -1;
4042                 }
4043
4044                 if(reordering_of_pic_nums_idc<3){
4045                     if(reordering_of_pic_nums_idc<2){
4046                         const int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
4047
4048                         if(abs_diff_pic_num >= h->max_pic_num){
4049                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
4050                             return -1;
4051                         }
4052
4053                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
4054                         else                                pred+= abs_diff_pic_num;
4055                         pred &= h->max_pic_num - 1;
4056
4057                         for(i= h->short_ref_count-1; i>=0; i--){
4058                             ref = h->short_ref[i];
4059                             assert(ref->reference == 3);
4060                             assert(!ref->long_ref);
4061                             if(ref->data[0] != NULL && ref->frame_num == pred && ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer
4062                                 break;
4063                         }
4064                         if(i>=0)
4065                             ref->pic_id= ref->frame_num;
4066                     }else{
4067                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
4068                         ref = h->long_ref[pic_id];
4069                         ref->pic_id= pic_id;
4070                         assert(ref->reference == 3);
4071                         assert(ref->long_ref);
4072                         i=0;
4073                     }
4074
4075                     if (i < 0) {
4076                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
4077                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
4078                     } else {
4079                         for(i=index; i+1<h->ref_count[list]; i++){
4080                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
4081                                 break;
4082                         }
4083                         for(; i > index; i--){
4084                             h->ref_list[list][i]= h->ref_list[list][i-1];
4085                         }
4086                         h->ref_list[list][index]= *ref;
4087                     }
4088                 }else{
4089                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
4090                     return -1;
4091                 }
4092             }
4093         }
4094
4095         if(h->slice_type!=B_TYPE) break;
4096     }
4097     for(list=0; list<2; list++){
4098         for(index= 0; index < h->ref_count[list]; index++){
4099             if(!h->ref_list[list][index].data[0])
4100                 h->ref_list[list][index]= s->current_picture;
4101         }
4102         if(h->slice_type!=B_TYPE) break;
4103     }
4104
4105     if(h->slice_type==B_TYPE && !h->direct_spatial_mv_pred)
4106         direct_dist_scale_factor(h);
4107     direct_ref_list_init(h);
4108     return 0;
4109 }
4110
4111 static void fill_mbaff_ref_list(H264Context *h){
4112     int list, i, j;
4113     for(list=0; list<2; list++){
4114         for(i=0; i<h->ref_count[list]; i++){
4115             Picture *frame = &h->ref_list[list][i];
4116             Picture *field = &h->ref_list[list][16+2*i];
4117             field[0] = *frame;
4118             for(j=0; j<3; j++)
4119                 field[0].linesize[j] <<= 1;
4120             field[1] = field[0];
4121             for(j=0; j<3; j++)
4122                 field[1].data[j] += frame->linesize[j];
4123
4124             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
4125             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
4126             for(j=0; j<2; j++){
4127                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
4128                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
4129             }
4130         }
4131     }
4132     for(j=0; j<h->ref_count[1]; j++){
4133         for(i=0; i<h->ref_count[0]; i++)
4134             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
4135         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
4136         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
4137     }
4138 }
4139
4140 static int pred_weight_table(H264Context *h){
4141     MpegEncContext * const s = &h->s;
4142     int list, i;
4143     int luma_def, chroma_def;
4144
4145     h->use_weight= 0;
4146     h->use_weight_chroma= 0;
4147     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
4148     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
4149     luma_def = 1<<h->luma_log2_weight_denom;
4150     chroma_def = 1<<h->chroma_log2_weight_denom;
4151
4152     for(list=0; list<2; list++){
4153         for(i=0; i<h->ref_count[list]; i++){
4154             int luma_weight_flag, chroma_weight_flag;
4155
4156             luma_weight_flag= get_bits1(&s->gb);
4157             if(luma_weight_flag){
4158                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
4159                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
4160                 if(   h->luma_weight[list][i] != luma_def
4161                    || h->luma_offset[list][i] != 0)
4162                     h->use_weight= 1;
4163             }else{
4164                 h->luma_weight[list][i]= luma_def;
4165                 h->luma_offset[list][i]= 0;
4166             }
4167
4168             chroma_weight_flag= get_bits1(&s->gb);
4169             if(chroma_weight_flag){
4170                 int j;
4171                 for(j=0; j<2; j++){
4172                     h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
4173                     h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
4174                     if(   h->chroma_weight[list][i][j] != chroma_def
4175                        || h->chroma_offset[list][i][j] != 0)
4176                         h->use_weight_chroma= 1;
4177                 }
4178             }else{
4179                 int j;
4180                 for(j=0; j<2; j++){
4181                     h->chroma_weight[list][i][j]= chroma_def;
4182                     h->chroma_offset[list][i][j]= 0;
4183                 }
4184             }
4185         }
4186         if(h->slice_type != B_TYPE) break;
4187     }
4188     h->use_weight= h->use_weight || h->use_weight_chroma;
4189     return 0;
4190 }
4191
4192 static void implicit_weight_table(H264Context *h){
4193     MpegEncContext * const s = &h->s;
4194     int ref0, ref1;
4195     int cur_poc = s->current_picture_ptr->poc;
4196
4197     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
4198        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
4199         h->use_weight= 0;
4200         h->use_weight_chroma= 0;
4201         return;
4202     }
4203
4204     h->use_weight= 2;
4205     h->use_weight_chroma= 2;
4206     h->luma_log2_weight_denom= 5;
4207     h->chroma_log2_weight_denom= 5;
4208
4209     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
4210         int poc0 = h->ref_list[0][ref0].poc;
4211         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
4212             int poc1 = h->ref_list[1][ref1].poc;
4213             int td = clip(poc1 - poc0, -128, 127);
4214             if(td){
4215                 int tb = clip(cur_poc - poc0, -128, 127);
4216                 int tx = (16384 + (FFABS(td) >> 1)) / td;
4217                 int dist_scale_factor = clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
4218                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
4219                     h->implicit_weight[ref0][ref1] = 32;
4220                 else
4221                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
4222             }else
4223                 h->implicit_weight[ref0][ref1] = 32;
4224         }
4225     }
4226 }
4227
4228 static inline void unreference_pic(H264Context *h, Picture *pic){
4229     int i;
4230     pic->reference=0;
4231     if(pic == h->delayed_output_pic)
4232         pic->reference=1;
4233     else{
4234         for(i = 0; h->delayed_pic[i]; i++)
4235             if(pic == h->delayed_pic[i]){
4236                 pic->reference=1;
4237                 break;
4238             }
4239     }
4240 }
4241
4242 /**
4243  * instantaneous decoder refresh.
4244  */
4245 static void idr(H264Context *h){
4246     int i;
4247
4248     for(i=0; i<16; i++){
4249         if (h->long_ref[i] != NULL) {
4250             unreference_pic(h, h->long_ref[i]);
4251             h->long_ref[i]= NULL;
4252         }
4253     }
4254     h->long_ref_count=0;
4255
4256     for(i=0; i<h->short_ref_count; i++){
4257         unreference_pic(h, h->short_ref[i]);
4258         h->short_ref[i]= NULL;
4259     }
4260     h->short_ref_count=0;
4261 }
4262
4263 /* forget old pics after a seek */
4264 static void flush_dpb(AVCodecContext *avctx){
4265     H264Context *h= avctx->priv_data;
4266     int i;
4267     for(i=0; i<16; i++) {
4268         if(h->delayed_pic[i])
4269             h->delayed_pic[i]->reference= 0;
4270         h->delayed_pic[i]= NULL;
4271     }
4272     if(h->delayed_output_pic)
4273         h->delayed_output_pic->reference= 0;
4274     h->delayed_output_pic= NULL;
4275     idr(h);
4276     if(h->s.current_picture_ptr)
4277         h->s.current_picture_ptr->reference= 0;
4278 }
4279
4280 /**
4281  *
4282  * @return the removed picture or NULL if an error occurs
4283  */
4284 static Picture * remove_short(H264Context *h, int frame_num){
4285     MpegEncContext * const s = &h->s;
4286     int i;
4287
4288     if(s->avctx->debug&FF_DEBUG_MMCO)
4289         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
4290
4291     for(i=0; i<h->short_ref_count; i++){
4292         Picture *pic= h->short_ref[i];
4293         if(s->avctx->debug&FF_DEBUG_MMCO)
4294             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
4295         if(pic->frame_num == frame_num){
4296             h->short_ref[i]= NULL;
4297             memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i - 1)*sizeof(Picture*));
4298             h->short_ref_count--;
4299             return pic;
4300         }
4301     }
4302     return NULL;
4303 }
4304
4305 /**
4306  *
4307  * @return the removed picture or NULL if an error occurs
4308  */
4309 static Picture * remove_long(H264Context *h, int i){
4310     Picture *pic;
4311
4312     pic= h->long_ref[i];
4313     h->long_ref[i]= NULL;
4314     if(pic) h->long_ref_count--;
4315
4316     return pic;
4317 }
4318
4319 /**
4320  * print short term list
4321  */
4322 static void print_short_term(H264Context *h) {
4323     uint32_t i;
4324     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
4325         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
4326         for(i=0; i<h->short_ref_count; i++){
4327             Picture *pic= h->short_ref[i];
4328             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
4329         }
4330     }
4331 }
4332
4333 /**
4334  * print long term list
4335  */
4336 static void print_long_term(H264Context *h) {
4337     uint32_t i;
4338     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
4339         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
4340         for(i = 0; i < 16; i++){
4341             Picture *pic= h->long_ref[i];
4342             if (pic) {
4343                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
4344             }
4345         }
4346     }
4347 }
4348
4349 /**
4350  * Executes the reference picture marking (memory management control operations).
4351  */
4352 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
4353     MpegEncContext * const s = &h->s;
4354     int i, j;
4355     int current_is_long=0;
4356     Picture *pic;
4357
4358     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
4359         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
4360
4361     for(i=0; i<mmco_count; i++){
4362         if(s->avctx->debug&FF_DEBUG_MMCO)
4363             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_frame_num, h->mmco[i].long_index);
4364
4365         switch(mmco[i].opcode){
4366         case MMCO_SHORT2UNUSED:
4367             pic= remove_short(h, mmco[i].short_frame_num);
4368             if(pic)
4369                 unreference_pic(h, pic);
4370             else if(s->avctx->debug&FF_DEBUG_MMCO)
4371                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_short() failure\n");
4372             break;
4373         case MMCO_SHORT2LONG:
4374             pic= remove_long(h, mmco[i].long_index);
4375             if(pic) unreference_pic(h, pic);
4376
4377             h->long_ref[ mmco[i].long_index ]= remove_short(h, mmco[i].short_frame_num);
4378             h->long_ref[ mmco[i].long_index ]->long_ref=1;
4379             h->long_ref_count++;
4380             break;
4381         case MMCO_LONG2UNUSED:
4382             pic= remove_long(h, mmco[i].long_index);
4383             if(pic)
4384                 unreference_pic(h, pic);
4385             else if(s->avctx->debug&FF_DEBUG_MMCO)
4386                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_long() failure\n");
4387             break;
4388         case MMCO_LONG:
4389             pic= remove_long(h, mmco[i].long_index);
4390             if(pic) unreference_pic(h, pic);
4391
4392             h->long_ref[ mmco[i].long_index ]= s->current_picture_ptr;
4393             h->long_ref[ mmco[i].long_index ]->long_ref=1;
4394             h->long_ref_count++;
4395
4396             current_is_long=1;
4397             break;
4398         case MMCO_SET_MAX_LONG:
4399             assert(mmco[i].long_index <= 16);
4400             // just remove the long term which index is greater than new max
4401             for(j = mmco[i].long_index; j<16; j++){
4402                 pic = remove_long(h, j);
4403                 if (pic) unreference_pic(h, pic);
4404             }
4405             break;
4406         case MMCO_RESET:
4407             while(h->short_ref_count){
4408                 pic= remove_short(h, h->short_ref[0]->frame_num);
4409                 unreference_pic(h, pic);
4410             }
4411             for(j = 0; j < 16; j++) {
4412                 pic= remove_long(h, j);
4413                 if(pic) unreference_pic(h, pic);
4414             }
4415             break;
4416         default: assert(0);
4417         }
4418     }
4419
4420     if(!current_is_long){
4421         pic= remove_short(h, s->current_picture_ptr->frame_num);
4422         if(pic){
4423             unreference_pic(h, pic);
4424             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
4425         }
4426
4427         if(h->short_ref_count)
4428             memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
4429
4430         h->short_ref[0]= s->current_picture_ptr;
4431         h->short_ref[0]->long_ref=0;
4432         h->short_ref_count++;
4433     }
4434
4435     print_short_term(h);
4436     print_long_term(h);
4437     return 0;
4438 }
4439
4440 static int decode_ref_pic_marking(H264Context *h){
4441     MpegEncContext * const s = &h->s;
4442     int i;
4443
4444     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
4445         s->broken_link= get_bits1(&s->gb) -1;
4446         h->mmco[0].long_index= get_bits1(&s->gb) - 1; // current_long_term_idx
4447         if(h->mmco[0].long_index == -1)
4448             h->mmco_index= 0;
4449         else{
4450             h->mmco[0].opcode= MMCO_LONG;
4451             h->mmco_index= 1;
4452         }
4453     }else{
4454         if(get_bits1(&s->gb)){ // adaptive_ref_pic_marking_mode_flag
4455             for(i= 0; i<MAX_MMCO_COUNT; i++) {
4456                 MMCOOpcode opcode= get_ue_golomb(&s->gb);;
4457
4458                 h->mmco[i].opcode= opcode;
4459                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
4460                     h->mmco[i].short_frame_num= (h->frame_num - get_ue_golomb(&s->gb) - 1) & ((1<<h->sps.log2_max_frame_num)-1); //FIXME fields
4461 /*                    if(h->mmco[i].short_frame_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_frame_num ] == NULL){
4462                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
4463                         return -1;
4464                     }*/
4465                 }
4466                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
4467                     h->mmco[i].long_index= get_ue_golomb(&s->gb);
4468                     if(/*h->mmco[i].long_index >= h->long_ref_count || h->long_ref[ h->mmco[i].long_index ] == NULL*/ h->mmco[i].long_index >= 16){
4469                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
4470                         return -1;
4471                     }
4472                 }
4473
4474                 if(opcode > MMCO_LONG){
4475                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
4476                     return -1;
4477                 }
4478                 if(opcode == MMCO_END)
4479                     break;
4480             }
4481             h->mmco_index= i;
4482         }else{
4483             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
4484
4485             if(h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count){ //FIXME fields
4486                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
4487                 h->mmco[0].short_frame_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
4488                 h->mmco_index= 1;
4489             }else
4490                 h->mmco_index= 0;
4491         }
4492     }
4493
4494     return 0;
4495 }
4496
4497 static int init_poc(H264Context *h){
4498     MpegEncContext * const s = &h->s;
4499     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
4500     int field_poc[2];
4501
4502     if(h->nal_unit_type == NAL_IDR_SLICE){
4503         h->frame_num_offset= 0;
4504     }else{
4505         if(h->frame_num < h->prev_frame_num)
4506             h->frame_num_offset= h->prev_frame_num_offset + max_frame_num;
4507         else
4508             h->frame_num_offset= h->prev_frame_num_offset;
4509     }
4510
4511     if(h->sps.poc_type==0){
4512         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
4513
4514         if(h->nal_unit_type == NAL_IDR_SLICE){
4515              h->prev_poc_msb=
4516              h->prev_poc_lsb= 0;
4517         }
4518
4519         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
4520             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
4521         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
4522             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
4523         else
4524             h->poc_msb = h->prev_poc_msb;
4525 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
4526         field_poc[0] =
4527         field_poc[1] = h->poc_msb + h->poc_lsb;
4528         if(s->picture_structure == PICT_FRAME)
4529             field_poc[1] += h->delta_poc_bottom;
4530     }else if(h->sps.poc_type==1){
4531         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
4532         int i;
4533
4534         if(h->sps.poc_cycle_length != 0)
4535             abs_frame_num = h->frame_num_offset + h->frame_num;
4536         else
4537             abs_frame_num = 0;
4538
4539         if(h->nal_ref_idc==0 && abs_frame_num > 0)
4540             abs_frame_num--;
4541
4542         expected_delta_per_poc_cycle = 0;
4543         for(i=0; i < h->sps.poc_cycle_length; i++)
4544             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
4545
4546         if(abs_frame_num > 0){
4547             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
4548             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
4549
4550             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
4551             for(i = 0; i <= frame_num_in_poc_cycle; i++)
4552                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
4553         } else
4554             expectedpoc = 0;
4555
4556         if(h->nal_ref_idc == 0)
4557             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
4558
4559         field_poc[0] = expectedpoc + h->delta_poc[0];
4560         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
4561
4562         if(s->picture_structure == PICT_FRAME)
4563             field_poc[1] += h->delta_poc[1];
4564     }else{
4565         int poc;
4566         if(h->nal_unit_type == NAL_IDR_SLICE){
4567             poc= 0;
4568         }else{
4569             if(h->nal_ref_idc) poc= 2*(h->frame_num_offset + h->frame_num);
4570             else               poc= 2*(h->frame_num_offset + h->frame_num) - 1;
4571         }
4572         field_poc[0]= poc;
4573         field_poc[1]= poc;
4574     }
4575
4576     if(s->picture_structure != PICT_BOTTOM_FIELD)
4577         s->current_picture_ptr->field_poc[0]= field_poc[0];
4578     if(s->picture_structure != PICT_TOP_FIELD)
4579         s->current_picture_ptr->field_poc[1]= field_poc[1];
4580     if(s->picture_structure == PICT_FRAME) // FIXME field pix?
4581         s->current_picture_ptr->poc= FFMIN(field_poc[0], field_poc[1]);
4582
4583     return 0;
4584 }
4585
4586 /**
4587  * decodes a slice header.
4588  * this will allso call MPV_common_init() and frame_start() as needed
4589  */
4590 static int decode_slice_header(H264Context *h){
4591     MpegEncContext * const s = &h->s;
4592     int first_mb_in_slice, pps_id;
4593     int num_ref_idx_active_override_flag;
4594     static const uint8_t slice_type_map[5]= {P_TYPE, B_TYPE, I_TYPE, SP_TYPE, SI_TYPE};
4595     int slice_type;
4596     int default_ref_list_done = 0;
4597
4598     s->current_picture.reference= h->nal_ref_idc != 0;
4599     s->dropable= h->nal_ref_idc == 0;
4600
4601     first_mb_in_slice= get_ue_golomb(&s->gb);
4602
4603     slice_type= get_ue_golomb(&s->gb);
4604     if(slice_type > 9){
4605         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
4606         return -1;
4607     }
4608     if(slice_type > 4){
4609         slice_type -= 5;
4610         h->slice_type_fixed=1;
4611     }else
4612         h->slice_type_fixed=0;
4613
4614     slice_type= slice_type_map[ slice_type ];
4615     if (slice_type == I_TYPE
4616         || (h->slice_num != 0 && slice_type == h->slice_type) ) {
4617         default_ref_list_done = 1;
4618     }
4619     h->slice_type= slice_type;
4620
4621     s->pict_type= h->slice_type; // to make a few old func happy, it's wrong though
4622
4623     pps_id= get_ue_golomb(&s->gb);
4624     if(pps_id>255){
4625         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
4626         return -1;
4627     }
4628     h->pps= h->pps_buffer[pps_id];
4629     if(h->pps.slice_group_count == 0){
4630         av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n");
4631         return -1;
4632     }
4633
4634     h->sps= h->sps_buffer[ h->pps.sps_id ];
4635     if(h->sps.log2_max_frame_num == 0){
4636         av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n");
4637         return -1;
4638     }
4639
4640     if(h->dequant_coeff_pps != pps_id){
4641         h->dequant_coeff_pps = pps_id;
4642         init_dequant_tables(h);
4643     }
4644
4645     s->mb_width= h->sps.mb_width;
4646     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
4647
4648     h->b_stride=  s->mb_width*4;
4649     h->b8_stride= s->mb_width*2;
4650
4651     s->width = 16*s->mb_width - 2*(h->sps.crop_left + h->sps.crop_right );
4652     if(h->sps.frame_mbs_only_flag)
4653         s->height= 16*s->mb_height - 2*(h->sps.crop_top  + h->sps.crop_bottom);
4654     else
4655         s->height= 16*s->mb_height - 4*(h->sps.crop_top  + h->sps.crop_bottom); //FIXME recheck
4656
4657     if (s->context_initialized
4658         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
4659         free_tables(h);
4660         MPV_common_end(s);
4661     }
4662     if (!s->context_initialized) {
4663         if (MPV_common_init(s) < 0)
4664             return -1;
4665
4666         if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
4667             memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
4668             memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
4669         }else{
4670             int i;
4671             for(i=0; i<16; i++){
4672 #define T(x) (x>>2) | ((x<<2) & 0xF)
4673                 h->zigzag_scan[i] = T(zigzag_scan[i]);
4674                 h-> field_scan[i] = T( field_scan[i]);
4675 #undef T
4676             }
4677         }
4678         if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
4679             memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
4680             memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
4681             memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
4682             memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
4683         }else{
4684             int i;
4685             for(i=0; i<64; i++){
4686 #define T(x) (x>>3) | ((x&7)<<3)
4687                 h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
4688                 h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
4689                 h->field_scan8x8[i]        = T(field_scan8x8[i]);
4690                 h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
4691 #undef T
4692             }
4693         }
4694         if(h->sps.transform_bypass){ //FIXME same ugly
4695             h->zigzag_scan_q0          = zigzag_scan;
4696             h->zigzag_scan8x8_q0       = zigzag_scan8x8;
4697             h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
4698             h->field_scan_q0           = field_scan;
4699             h->field_scan8x8_q0        = field_scan8x8;
4700             h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
4701         }else{
4702             h->zigzag_scan_q0          = h->zigzag_scan;
4703             h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
4704             h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
4705             h->field_scan_q0           = h->field_scan;
4706             h->field_scan8x8_q0        = h->field_scan8x8;
4707             h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
4708         }
4709
4710         alloc_tables(h);
4711
4712         s->avctx->width = s->width;
4713         s->avctx->height = s->height;
4714         s->avctx->sample_aspect_ratio= h->sps.sar;
4715         if(!s->avctx->sample_aspect_ratio.den)
4716             s->avctx->sample_aspect_ratio.den = 1;
4717
4718         if(h->sps.timing_info_present_flag){
4719             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
4720             if(h->x264_build > 0 && h->x264_build < 44)
4721                 s->avctx->time_base.den *= 2;
4722             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
4723                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
4724         }
4725     }
4726
4727     if(h->slice_num == 0){
4728         if(frame_start(h) < 0)
4729             return -1;
4730     }
4731
4732     s->current_picture_ptr->frame_num= //FIXME frame_num cleanup
4733     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
4734
4735     h->mb_mbaff = 0;
4736     h->mb_aff_frame = 0;
4737     if(h->sps.frame_mbs_only_flag){
4738         s->picture_structure= PICT_FRAME;
4739     }else{
4740         if(get_bits1(&s->gb)) { //field_pic_flag
4741             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
4742             av_log(h->s.avctx, AV_LOG_ERROR, "PAFF interlacing is not implemented\n");
4743         } else {
4744             s->picture_structure= PICT_FRAME;
4745             h->mb_aff_frame = h->sps.mb_aff;
4746         }
4747     }
4748
4749     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
4750     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << h->mb_aff_frame;
4751     if(s->mb_y >= s->mb_height){
4752         return -1;
4753     }
4754
4755     if(s->picture_structure==PICT_FRAME){
4756         h->curr_pic_num=   h->frame_num;
4757         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
4758     }else{
4759         h->curr_pic_num= 2*h->frame_num;
4760         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
4761     }
4762
4763     if(h->nal_unit_type == NAL_IDR_SLICE){
4764         get_ue_golomb(&s->gb); /* idr_pic_id */
4765     }
4766
4767     if(h->sps.poc_type==0){
4768         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
4769
4770         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
4771             h->delta_poc_bottom= get_se_golomb(&s->gb);
4772         }
4773     }
4774
4775     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
4776         h->delta_poc[0]= get_se_golomb(&s->gb);
4777
4778         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
4779             h->delta_poc[1]= get_se_golomb(&s->gb);
4780     }
4781
4782     init_poc(h);
4783
4784     if(h->pps.redundant_pic_cnt_present){
4785         h->redundant_pic_count= get_ue_golomb(&s->gb);
4786     }
4787
4788     //set defaults, might be overriden a few line later
4789     h->ref_count[0]= h->pps.ref_count[0];
4790     h->ref_count[1]= h->pps.ref_count[1];
4791
4792     if(h->slice_type == P_TYPE || h->slice_type == SP_TYPE || h->slice_type == B_TYPE){
4793         if(h->slice_type == B_TYPE){
4794             h->direct_spatial_mv_pred= get_bits1(&s->gb);
4795             if(h->sps.mb_aff && h->direct_spatial_mv_pred)
4796                 av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + spatial direct mode is not implemented\n");
4797         }
4798         num_ref_idx_active_override_flag= get_bits1(&s->gb);
4799
4800         if(num_ref_idx_active_override_flag){
4801             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
4802             if(h->slice_type==B_TYPE)
4803                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
4804
4805             if(h->ref_count[0] > 32 || h->ref_count[1] > 32){
4806                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
4807                 return -1;
4808             }
4809         }
4810     }
4811
4812     if(!default_ref_list_done){
4813         fill_default_ref_list(h);
4814     }
4815
4816     if(decode_ref_pic_list_reordering(h) < 0)
4817         return -1;
4818
4819     if(   (h->pps.weighted_pred          && (h->slice_type == P_TYPE || h->slice_type == SP_TYPE ))
4820        || (h->pps.weighted_bipred_idc==1 && h->slice_type==B_TYPE ) )
4821         pred_weight_table(h);
4822     else if(h->pps.weighted_bipred_idc==2 && h->slice_type==B_TYPE)
4823         implicit_weight_table(h);
4824     else
4825         h->use_weight = 0;
4826
4827     if(s->current_picture.reference)
4828         decode_ref_pic_marking(h);
4829
4830     if(FRAME_MBAFF)
4831         fill_mbaff_ref_list(h);
4832
4833     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE && h->pps.cabac )
4834         h->cabac_init_idc = get_ue_golomb(&s->gb);
4835
4836     h->last_qscale_diff = 0;
4837     s->qscale = h->pps.init_qp + get_se_golomb(&s->gb);
4838     if(s->qscale<0 || s->qscale>51){
4839         av_log(s->avctx, AV_LOG_ERROR, "QP %d out of range\n", s->qscale);
4840         return -1;
4841     }
4842     h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
4843     //FIXME qscale / qp ... stuff
4844     if(h->slice_type == SP_TYPE){
4845         get_bits1(&s->gb); /* sp_for_switch_flag */
4846     }
4847     if(h->slice_type==SP_TYPE || h->slice_type == SI_TYPE){
4848         get_se_golomb(&s->gb); /* slice_qs_delta */
4849     }
4850
4851     h->deblocking_filter = 1;
4852     h->slice_alpha_c0_offset = 0;
4853     h->slice_beta_offset = 0;
4854     if( h->pps.deblocking_filter_parameters_present ) {
4855         h->deblocking_filter= get_ue_golomb(&s->gb);
4856         if(h->deblocking_filter < 2)
4857             h->deblocking_filter^= 1; // 1<->0
4858
4859         if( h->deblocking_filter ) {
4860             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4861             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4862         }
4863     }
4864     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4865        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type != I_TYPE)
4866        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type == B_TYPE)
4867        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4868         h->deblocking_filter= 0;
4869
4870 #if 0 //FMO
4871     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4872         slice_group_change_cycle= get_bits(&s->gb, ?);
4873 #endif
4874
4875     h->slice_num++;
4876
4877     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4878     h->emu_edge_height= FRAME_MBAFF ? 0 : h->emu_edge_width;
4879
4880     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4881         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%d frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s\n",
4882                h->slice_num,
4883                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4884                first_mb_in_slice,
4885                av_get_pict_type_char(h->slice_type),
4886                pps_id, h->frame_num,
4887                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4888                h->ref_count[0], h->ref_count[1],
4889                s->qscale,
4890                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4891                h->use_weight,
4892                h->use_weight==1 && h->use_weight_chroma ? "c" : ""
4893                );
4894     }
4895
4896     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !s->current_picture.reference){
4897         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
4898         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
4899     }else{
4900         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
4901         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
4902     }
4903
4904     return 0;
4905 }
4906
4907 /**
4908  *
4909  */
4910 static inline int get_level_prefix(GetBitContext *gb){
4911     unsigned int buf;
4912     int log;
4913
4914     OPEN_READER(re, gb);
4915     UPDATE_CACHE(re, gb);
4916     buf=GET_CACHE(re, gb);
4917
4918     log= 32 - av_log2(buf);
4919 #ifdef TRACE
4920     print_bin(buf>>(32-log), log);
4921     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4922 #endif
4923
4924     LAST_SKIP_BITS(re, gb, log);
4925     CLOSE_READER(re, gb);
4926
4927     return log-1;
4928 }
4929
4930 static inline int get_dct8x8_allowed(H264Context *h){
4931     int i;
4932     for(i=0; i<4; i++){
4933         if(!IS_SUB_8X8(h->sub_mb_type[i])
4934            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4935             return 0;
4936     }
4937     return 1;
4938 }
4939
4940 /**
4941  * decodes a residual block.
4942  * @param n block index
4943  * @param scantable scantable
4944  * @param max_coeff number of coefficients in the block
4945  * @return <0 if an error occured
4946  */
4947 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4948     MpegEncContext * const s = &h->s;
4949     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4950     int level[16];
4951     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4952
4953     //FIXME put trailing_onex into the context
4954
4955     if(n == CHROMA_DC_BLOCK_INDEX){
4956         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4957         total_coeff= coeff_token>>2;
4958     }else{
4959         if(n == LUMA_DC_BLOCK_INDEX){
4960             total_coeff= pred_non_zero_count(h, 0);
4961             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4962             total_coeff= coeff_token>>2;
4963         }else{
4964             total_coeff= pred_non_zero_count(h, n);
4965             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4966             total_coeff= coeff_token>>2;
4967             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4968         }
4969     }
4970
4971     //FIXME set last_non_zero?
4972
4973     if(total_coeff==0)
4974         return 0;
4975
4976     trailing_ones= coeff_token&3;
4977     tprintf("trailing:%d, total:%d\n", trailing_ones, total_coeff);
4978     assert(total_coeff<=16);
4979
4980     for(i=0; i<trailing_ones; i++){
4981         level[i]= 1 - 2*get_bits1(gb);
4982     }
4983
4984     if(i<total_coeff) {
4985         int level_code, mask;
4986         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4987         int prefix= get_level_prefix(gb);
4988
4989         //first coefficient has suffix_length equal to 0 or 1
4990         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4991             if(suffix_length)
4992                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4993             else
4994                 level_code= (prefix<<suffix_length); //part
4995         }else if(prefix==14){
4996             if(suffix_length)
4997                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4998             else
4999                 level_code= prefix + get_bits(gb, 4); //part
5000         }else if(prefix==15){
5001             level_code= (prefix<<suffix_length) + get_bits(gb, 12); //part
5002             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
5003         }else{
5004             av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
5005             return -1;
5006         }
5007
5008         if(trailing_ones < 3) level_code += 2;
5009
5010         suffix_length = 1;
5011         if(level_code > 5)
5012             suffix_length++;
5013         mask= -(level_code&1);
5014         level[i]= (((2+level_code)>>1) ^ mask) - mask;
5015         i++;
5016
5017         //remaining coefficients have suffix_length > 0
5018         for(;i<total_coeff;i++) {
5019             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
5020             prefix = get_level_prefix(gb);
5021             if(prefix<15){
5022                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
5023             }else if(prefix==15){
5024                 level_code =  (prefix<<suffix_length) + get_bits(gb, 12);
5025             }else{
5026                 av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
5027                 return -1;
5028             }
5029             mask= -(level_code&1);
5030             level[i]= (((2+level_code)>>1) ^ mask) - mask;
5031             if(level_code > suffix_limit[suffix_length])
5032                 suffix_length++;
5033         }
5034     }
5035
5036     if(total_coeff == max_coeff)
5037         zeros_left=0;
5038     else{
5039         if(n == CHROMA_DC_BLOCK_INDEX)
5040             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
5041         else
5042             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
5043     }
5044
5045     coeff_num = zeros_left + total_coeff - 1;
5046     j = scantable[coeff_num];
5047     if(n > 24){
5048         block[j] = level[0];
5049         for(i=1;i<total_coeff;i++) {
5050             if(zeros_left <= 0)
5051                 run_before = 0;
5052             else if(zeros_left < 7){
5053                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
5054             }else{
5055                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
5056             }
5057             zeros_left -= run_before;
5058             coeff_num -= 1 + run_before;
5059             j= scantable[ coeff_num ];
5060
5061             block[j]= level[i];
5062         }
5063     }else{
5064         block[j] = (level[0] * qmul[j] + 32)>>6;
5065         for(i=1;i<total_coeff;i++) {
5066             if(zeros_left <= 0)
5067                 run_before = 0;
5068             else if(zeros_left < 7){
5069                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
5070             }else{
5071                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
5072             }
5073             zeros_left -= run_before;
5074             coeff_num -= 1 + run_before;
5075             j= scantable[ coeff_num ];
5076
5077             block[j]= (level[i] * qmul[j] + 32)>>6;
5078         }
5079     }
5080
5081     if(zeros_left<0){
5082         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
5083         return -1;
5084     }
5085
5086     return 0;
5087 }
5088
5089 static void predict_field_decoding_flag(H264Context *h){
5090     MpegEncContext * const s = &h->s;
5091     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5092     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
5093                 ? s->current_picture.mb_type[mb_xy-1]
5094                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
5095                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
5096                 : 0;
5097     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
5098 }
5099
5100 /**
5101  * decodes a P_SKIP or B_SKIP macroblock
5102  */
5103 static void decode_mb_skip(H264Context *h){
5104     MpegEncContext * const s = &h->s;
5105     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5106     int mb_type=0;
5107
5108     memset(h->non_zero_count[mb_xy], 0, 16);
5109     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
5110
5111     if(MB_FIELD)
5112         mb_type|= MB_TYPE_INTERLACED;
5113
5114     if( h->slice_type == B_TYPE )
5115     {
5116         // just for fill_caches. pred_direct_motion will set the real mb_type
5117         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
5118
5119         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
5120         pred_direct_motion(h, &mb_type);
5121         mb_type|= MB_TYPE_SKIP;
5122     }
5123     else
5124     {
5125         int mx, my;
5126         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
5127
5128         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
5129         pred_pskip_motion(h, &mx, &my);
5130         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
5131         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
5132     }
5133
5134     write_back_motion(h, mb_type);
5135     s->current_picture.mb_type[mb_xy]= mb_type;
5136     s->current_picture.qscale_table[mb_xy]= s->qscale;
5137     h->slice_table[ mb_xy ]= h->slice_num;
5138     h->prev_mb_skipped= 1;
5139 }
5140
5141 /**
5142  * decodes a macroblock
5143  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5144  */
5145 static int decode_mb_cavlc(H264Context *h){
5146     MpegEncContext * const s = &h->s;
5147     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5148     int mb_type, partition_count, cbp;
5149     int dct8x8_allowed= h->pps.transform_8x8_mode;
5150
5151     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
5152
5153     tprintf("pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5154     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
5155                 down the code */
5156     if(h->slice_type != I_TYPE && h->slice_type != SI_TYPE){
5157         if(s->mb_skip_run==-1)
5158             s->mb_skip_run= get_ue_golomb(&s->gb);
5159
5160         if (s->mb_skip_run--) {
5161             if(FRAME_MBAFF && (s->mb_y&1) == 0){
5162                 if(s->mb_skip_run==0)
5163                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
5164                 else
5165                     predict_field_decoding_flag(h);
5166             }
5167             decode_mb_skip(h);
5168             return 0;
5169         }
5170     }
5171     if(FRAME_MBAFF){
5172         if( (s->mb_y&1) == 0 )
5173             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
5174     }else
5175         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5176
5177     h->prev_mb_skipped= 0;
5178
5179     mb_type= get_ue_golomb(&s->gb);
5180     if(h->slice_type == B_TYPE){
5181         if(mb_type < 23){
5182             partition_count= b_mb_type_info[mb_type].partition_count;
5183             mb_type=         b_mb_type_info[mb_type].type;
5184         }else{
5185             mb_type -= 23;
5186             goto decode_intra_mb;
5187         }
5188     }else if(h->slice_type == P_TYPE /*|| h->slice_type == SP_TYPE */){
5189         if(mb_type < 5){
5190             partition_count= p_mb_type_info[mb_type].partition_count;
5191             mb_type=         p_mb_type_info[mb_type].type;
5192         }else{
5193             mb_type -= 5;
5194             goto decode_intra_mb;
5195         }
5196     }else{
5197        assert(h->slice_type == I_TYPE);
5198 decode_intra_mb:
5199         if(mb_type > 25){
5200             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
5201             return -1;
5202         }
5203         partition_count=0;
5204         cbp= i_mb_type_info[mb_type].cbp;
5205         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5206         mb_type= i_mb_type_info[mb_type].type;
5207     }
5208
5209     if(MB_FIELD)
5210         mb_type |= MB_TYPE_INTERLACED;
5211
5212     h->slice_table[ mb_xy ]= h->slice_num;
5213
5214     if(IS_INTRA_PCM(mb_type)){
5215         unsigned int x, y;
5216
5217         // we assume these blocks are very rare so we dont optimize it
5218         align_get_bits(&s->gb);
5219
5220         // The pixels are stored in the same order as levels in h->mb array.
5221         for(y=0; y<16; y++){
5222             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
5223             for(x=0; x<16; x++){
5224                 tprintf("LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
5225                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
5226             }
5227         }
5228         for(y=0; y<8; y++){
5229             const int index= 256 + 4*(y&3) + 32*(y>>2);
5230             for(x=0; x<8; x++){
5231                 tprintf("CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
5232                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
5233             }
5234         }
5235         for(y=0; y<8; y++){
5236             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
5237             for(x=0; x<8; x++){
5238                 tprintf("CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
5239                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
5240             }
5241         }
5242
5243         // In deblocking, the quantizer is 0
5244         s->current_picture.qscale_table[mb_xy]= 0;
5245         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0);
5246         // All coeffs are present
5247         memset(h->non_zero_count[mb_xy], 16, 16);
5248
5249         s->current_picture.mb_type[mb_xy]= mb_type;
5250         return 0;
5251     }
5252
5253     if(MB_MBAFF){
5254         h->ref_count[0] <<= 1;
5255         h->ref_count[1] <<= 1;
5256     }
5257
5258     fill_caches(h, mb_type, 0);
5259
5260     //mb_pred
5261     if(IS_INTRA(mb_type)){
5262 //            init_top_left_availability(h);
5263             if(IS_INTRA4x4(mb_type)){
5264                 int i;
5265                 int di = 1;
5266                 if(dct8x8_allowed && get_bits1(&s->gb)){
5267                     mb_type |= MB_TYPE_8x8DCT;
5268                     di = 4;
5269                 }
5270
5271 //                fill_intra4x4_pred_table(h);
5272                 for(i=0; i<16; i+=di){
5273                     int mode= pred_intra_mode(h, i);
5274
5275                     if(!get_bits1(&s->gb)){
5276                         const int rem_mode= get_bits(&s->gb, 3);
5277                         mode = rem_mode + (rem_mode >= mode);
5278                     }
5279
5280                     if(di==4)
5281                         fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5282                     else
5283                         h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
5284                 }
5285                 write_back_intra_pred_mode(h);
5286                 if( check_intra4x4_pred_mode(h) < 0)
5287                     return -1;
5288             }else{
5289                 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
5290                 if(h->intra16x16_pred_mode < 0)
5291                     return -1;
5292             }
5293             h->chroma_pred_mode= get_ue_golomb(&s->gb);
5294
5295             h->chroma_pred_mode= check_intra_pred_mode(h, h->chroma_pred_mode);
5296             if(h->chroma_pred_mode < 0)
5297                 return -1;
5298     }else if(partition_count==4){
5299         int i, j, sub_partition_count[4], list, ref[2][4];
5300
5301         if(h->slice_type == B_TYPE){
5302             for(i=0; i<4; i++){
5303                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
5304                 if(h->sub_mb_type[i] >=13){
5305                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %d out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
5306                     return -1;
5307                 }
5308                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5309                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5310             }
5311             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
5312                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
5313                 pred_direct_motion(h, &mb_type);
5314                 h->ref_cache[0][scan8[4]] =
5315                 h->ref_cache[1][scan8[4]] =
5316                 h->ref_cache[0][scan8[12]] =
5317                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5318             }
5319         }else{
5320             assert(h->slice_type == P_TYPE || h->slice_type == SP_TYPE); //FIXME SP correct ?
5321             for(i=0; i<4; i++){
5322                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
5323                 if(h->sub_mb_type[i] >=4){
5324                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %d out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
5325                     return -1;
5326                 }
5327                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5328                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5329             }
5330         }
5331
5332         for(list=0; list<2; list++){
5333             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
5334             if(ref_count == 0) continue;
5335             for(i=0; i<4; i++){
5336                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
5337                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
5338                     ref[list][i] = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
5339                 }else{
5340                  //FIXME
5341                     ref[list][i] = -1;
5342                 }
5343             }
5344         }
5345
5346         if(dct8x8_allowed)
5347             dct8x8_allowed = get_dct8x8_allowed(h);
5348
5349         for(list=0; list<2; list++){
5350             const int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
5351             if(ref_count == 0) continue;
5352
5353             for(i=0; i<4; i++){
5354                 if(IS_DIRECT(h->sub_mb_type[i])) {
5355                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
5356                     continue;
5357                 }
5358                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
5359                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5360
5361                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
5362                     const int sub_mb_type= h->sub_mb_type[i];
5363                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5364                     for(j=0; j<sub_partition_count[i]; j++){
5365                         int mx, my;
5366                         const int index= 4*i + block_width*j;
5367                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5368                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
5369                         mx += get_se_golomb(&s->gb);
5370                         my += get_se_golomb(&s->gb);
5371                         tprintf("final mv:%d %d\n", mx, my);
5372
5373                         if(IS_SUB_8X8(sub_mb_type)){
5374                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]=
5375                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5376                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]=
5377                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5378                         }else if(IS_SUB_8X4(sub_mb_type)){
5379                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]= mx;
5380                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]= my;
5381                         }else if(IS_SUB_4X8(sub_mb_type)){
5382                             mv_cache[ 0 ][0]= mv_cache[ 8 ][0]= mx;
5383                             mv_cache[ 0 ][1]= mv_cache[ 8 ][1]= my;
5384                         }else{
5385                             assert(IS_SUB_4X4(sub_mb_type));
5386                             mv_cache[ 0 ][0]= mx;
5387                             mv_cache[ 0 ][1]= my;
5388                         }
5389                     }
5390                 }else{
5391                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5392                     p[0] = p[1]=
5393                     p[8] = p[9]= 0;
5394                 }
5395             }
5396         }
5397     }else if(IS_DIRECT(mb_type)){
5398         pred_direct_motion(h, &mb_type);
5399         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5400     }else{
5401         int list, mx, my, i;
5402          //FIXME we should set ref_idx_l? to 0 if we use that later ...
5403         if(IS_16X16(mb_type)){
5404             for(list=0; list<2; list++){
5405                 if(h->ref_count[list]>0){
5406                     if(IS_DIR(mb_type, 0, list)){
5407                         const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
5408                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
5409                     }else
5410                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (LIST_NOT_USED&0xFF), 1);
5411                 }
5412             }
5413             for(list=0; list<2; list++){
5414                 if(IS_DIR(mb_type, 0, list)){
5415                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
5416                     mx += get_se_golomb(&s->gb);
5417                     my += get_se_golomb(&s->gb);
5418                     tprintf("final mv:%d %d\n", mx, my);
5419
5420                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5421                 }else
5422                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5423             }
5424         }
5425         else if(IS_16X8(mb_type)){
5426             for(list=0; list<2; list++){
5427                 if(h->ref_count[list]>0){
5428                     for(i=0; i<2; i++){
5429                         if(IS_DIR(mb_type, i, list)){
5430                             const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
5431                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
5432                         }else
5433                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5434                     }
5435                 }
5436             }
5437             for(list=0; list<2; list++){
5438                 for(i=0; i<2; i++){
5439                     if(IS_DIR(mb_type, i, list)){
5440                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
5441                         mx += get_se_golomb(&s->gb);
5442                         my += get_se_golomb(&s->gb);
5443                         tprintf("final mv:%d %d\n", mx, my);
5444
5445                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5446                     }else
5447                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5448                 }
5449             }
5450         }else{
5451             assert(IS_8X16(mb_type));
5452             for(list=0; list<2; list++){
5453                 if(h->ref_count[list]>0){
5454                     for(i=0; i<2; i++){
5455                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5456                             const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
5457                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
5458                         }else
5459                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5460                     }
5461                 }
5462             }
5463             for(list=0; list<2; list++){
5464                 for(i=0; i<2; i++){
5465                     if(IS_DIR(mb_type, i, list)){
5466                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
5467                         mx += get_se_golomb(&s->gb);
5468                         my += get_se_golomb(&s->gb);
5469                         tprintf("final mv:%d %d\n", mx, my);
5470
5471                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5472                     }else
5473                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5474                 }
5475             }
5476         }
5477     }
5478
5479     if(IS_INTER(mb_type))
5480         write_back_motion(h, mb_type);
5481
5482     if(!IS_INTRA16x16(mb_type)){
5483         cbp= get_ue_golomb(&s->gb);
5484         if(cbp > 47){
5485             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%d) at %d %d\n", cbp, s->mb_x, s->mb_y);
5486             return -1;
5487         }
5488
5489         if(IS_INTRA4x4(mb_type))
5490             cbp= golomb_to_intra4x4_cbp[cbp];
5491         else
5492             cbp= golomb_to_inter_cbp[cbp];
5493     }
5494     h->cbp = cbp;
5495
5496     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
5497         if(get_bits1(&s->gb))
5498             mb_type |= MB_TYPE_8x8DCT;
5499     }
5500     s->current_picture.mb_type[mb_xy]= mb_type;
5501
5502     if(cbp || IS_INTRA16x16(mb_type)){
5503         int i8x8, i4x4, chroma_idx;
5504         int chroma_qp, dquant;
5505         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
5506         const uint8_t *scan, *scan8x8, *dc_scan;
5507
5508 //        fill_non_zero_count_cache(h);
5509
5510         if(IS_INTERLACED(mb_type)){
5511             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
5512             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5513             dc_scan= luma_dc_field_scan;
5514         }else{
5515             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
5516             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5517             dc_scan= luma_dc_zigzag_scan;
5518         }
5519
5520         dquant= get_se_golomb(&s->gb);
5521
5522         if( dquant > 25 || dquant < -26 ){
5523             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
5524             return -1;
5525         }
5526
5527         s->qscale += dquant;
5528         if(((unsigned)s->qscale) > 51){
5529             if(s->qscale<0) s->qscale+= 52;
5530             else            s->qscale-= 52;
5531         }
5532
5533         h->chroma_qp= chroma_qp= get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
5534         if(IS_INTRA16x16(mb_type)){
5535             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
5536                 return -1; //FIXME continue if partitioned and other return -1 too
5537             }
5538
5539             assert((cbp&15) == 0 || (cbp&15) == 15);
5540
5541             if(cbp&15){
5542                 for(i8x8=0; i8x8<4; i8x8++){
5543                     for(i4x4=0; i4x4<4; i4x4++){
5544                         const int index= i4x4 + 4*i8x8;
5545                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
5546                             return -1;
5547                         }
5548                     }
5549                 }
5550             }else{
5551                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5552             }
5553         }else{
5554             for(i8x8=0; i8x8<4; i8x8++){
5555                 if(cbp & (1<<i8x8)){
5556                     if(IS_8x8DCT(mb_type)){
5557                         DCTELEM *buf = &h->mb[64*i8x8];
5558                         uint8_t *nnz;
5559                         for(i4x4=0; i4x4<4; i4x4++){
5560                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
5561                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
5562                                 return -1;
5563                         }
5564                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5565                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
5566                     }else{
5567                         for(i4x4=0; i4x4<4; i4x4++){
5568                             const int index= i4x4 + 4*i8x8;
5569
5570                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
5571                                 return -1;
5572                             }
5573                         }
5574                     }
5575                 }else{
5576                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5577                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5578                 }
5579             }
5580         }
5581
5582         if(cbp&0x30){
5583             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
5584                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
5585                     return -1;
5586                 }
5587         }
5588
5589         if(cbp&0x20){
5590             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
5591                 for(i4x4=0; i4x4<4; i4x4++){
5592                     const int index= 16 + 4*chroma_idx + i4x4;
5593                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][chroma_qp], 15) < 0){
5594                         return -1;
5595                     }
5596                 }
5597             }
5598         }else{
5599             uint8_t * const nnz= &h->non_zero_count_cache[0];
5600             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5601             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5602         }
5603     }else{
5604         uint8_t * const nnz= &h->non_zero_count_cache[0];
5605         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5606         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5607         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5608     }
5609     s->current_picture.qscale_table[mb_xy]= s->qscale;
5610     write_back_non_zero_count(h);
5611
5612     if(MB_MBAFF){
5613         h->ref_count[0] >>= 1;
5614         h->ref_count[1] >>= 1;
5615     }
5616
5617     return 0;
5618 }
5619
5620 static int decode_cabac_field_decoding_flag(H264Context *h) {
5621     MpegEncContext * const s = &h->s;
5622     const int mb_x = s->mb_x;
5623     const int mb_y = s->mb_y & ~1;
5624     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
5625     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
5626
5627     unsigned int ctx = 0;
5628
5629     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
5630         ctx += 1;
5631     }
5632     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
5633         ctx += 1;
5634     }
5635
5636     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
5637 }
5638
5639 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
5640     uint8_t *state= &h->cabac_state[ctx_base];
5641     int mb_type;
5642
5643     if(intra_slice){
5644         MpegEncContext * const s = &h->s;
5645         const int mba_xy = h->left_mb_xy[0];
5646         const int mbb_xy = h->top_mb_xy;
5647         int ctx=0;
5648         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
5649             ctx++;
5650         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
5651             ctx++;
5652         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
5653             return 0;   /* I4x4 */
5654         state += 2;
5655     }else{
5656         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
5657             return 0;   /* I4x4 */
5658     }
5659
5660     if( get_cabac_terminate( &h->cabac ) )
5661         return 25;  /* PCM */
5662
5663     mb_type = 1; /* I16x16 */
5664     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
5665     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
5666         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
5667     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
5668     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
5669     return mb_type;
5670 }
5671
5672 static int decode_cabac_mb_type( H264Context *h ) {
5673     MpegEncContext * const s = &h->s;
5674
5675     if( h->slice_type == I_TYPE ) {
5676         return decode_cabac_intra_mb_type(h, 3, 1);
5677     } else if( h->slice_type == P_TYPE ) {
5678         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5679             /* P-type */
5680             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5681                 /* P_L0_D16x16, P_8x8 */
5682                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5683             } else {
5684                 /* P_L0_D8x16, P_L0_D16x8 */
5685                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5686             }
5687         } else {
5688             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
5689         }
5690     } else if( h->slice_type == B_TYPE ) {
5691         const int mba_xy = h->left_mb_xy[0];
5692         const int mbb_xy = h->top_mb_xy;
5693         int ctx = 0;
5694         int bits;
5695
5696         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
5697             ctx++;
5698         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
5699             ctx++;
5700
5701         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
5702             return 0; /* B_Direct_16x16 */
5703
5704         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
5705             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
5706         }
5707
5708         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
5709         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
5710         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
5711         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5712         if( bits < 8 )
5713             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
5714         else if( bits == 13 ) {
5715             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
5716         } else if( bits == 14 )
5717             return 11; /* B_L1_L0_8x16 */
5718         else if( bits == 15 )
5719             return 22; /* B_8x8 */
5720
5721         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5722         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5723     } else {
5724         /* TODO SI/SP frames? */
5725         return -1;
5726     }
5727 }
5728
5729 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
5730     MpegEncContext * const s = &h->s;
5731     int mba_xy, mbb_xy;
5732     int ctx = 0;
5733
5734     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
5735         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
5736         mba_xy = mb_xy - 1;
5737         if( (mb_y&1)
5738             && h->slice_table[mba_xy] == h->slice_num
5739             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
5740             mba_xy += s->mb_stride;
5741         if( MB_FIELD ){
5742             mbb_xy = mb_xy - s->mb_stride;
5743             if( !(mb_y&1)
5744                 && h->slice_table[mbb_xy] == h->slice_num
5745                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
5746                 mbb_xy -= s->mb_stride;
5747         }else
5748             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
5749     }else{
5750         int mb_xy = mb_x + mb_y*s->mb_stride;
5751         mba_xy = mb_xy - 1;
5752         mbb_xy = mb_xy - s->mb_stride;
5753     }
5754
5755     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5756         ctx++;
5757     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5758         ctx++;
5759
5760     if( h->slice_type == B_TYPE )
5761         ctx += 13;
5762     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
5763 }
5764
5765 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5766     int mode = 0;
5767
5768     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5769         return pred_mode;
5770
5771     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5772     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5773     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5774
5775     if( mode >= pred_mode )
5776         return mode + 1;
5777     else
5778         return mode;
5779 }
5780
5781 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5782     const int mba_xy = h->left_mb_xy[0];
5783     const int mbb_xy = h->top_mb_xy;
5784
5785     int ctx = 0;
5786
5787     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5788     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5789         ctx++;
5790
5791     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5792         ctx++;
5793
5794     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5795         return 0;
5796
5797     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5798         return 1;
5799     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5800         return 2;
5801     else
5802         return 3;
5803 }
5804
5805 static const uint8_t block_idx_x[16] = {
5806     0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
5807 };
5808 static const uint8_t block_idx_y[16] = {
5809     0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3
5810 };
5811 static const uint8_t block_idx_xy[4][4] = {
5812     { 0, 2, 8,  10},
5813     { 1, 3, 9,  11},
5814     { 4, 6, 12, 14},
5815     { 5, 7, 13, 15}
5816 };
5817
5818 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5819     int cbp = 0;
5820     int cbp_b = -1;
5821     int i8x8;
5822
5823     if( h->slice_table[h->top_mb_xy] == h->slice_num ) {
5824         cbp_b = h->top_cbp;
5825         tprintf("cbp_b = top_cbp = %x\n", cbp_b);
5826     }
5827
5828     for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5829         int cbp_a = -1;
5830         int x, y;
5831         int ctx = 0;
5832
5833         x = block_idx_x[4*i8x8];
5834         y = block_idx_y[4*i8x8];
5835
5836         if( x > 0 )
5837             cbp_a = cbp;
5838         else if( h->slice_table[h->left_mb_xy[0]] == h->slice_num ) {
5839             cbp_a = h->left_cbp;
5840             tprintf("cbp_a = left_cbp = %x\n", cbp_a);
5841         }
5842
5843         if( y > 0 )
5844             cbp_b = cbp;
5845
5846         /* No need to test for skip as we put 0 for skip block */
5847         /* No need to test for IPCM as we put 1 for IPCM block */
5848         if( cbp_a >= 0 ) {
5849             int i8x8a = block_idx_xy[(x-1)&0x03][y]/4;
5850             if( ((cbp_a >> i8x8a)&0x01) == 0 )
5851                 ctx++;
5852         }
5853
5854         if( cbp_b >= 0 ) {
5855             int i8x8b = block_idx_xy[x][(y-1)&0x03]/4;
5856             if( ((cbp_b >> i8x8b)&0x01) == 0 )
5857                 ctx += 2;
5858         }
5859
5860         if( get_cabac( &h->cabac, &h->cabac_state[73 + ctx] ) ) {
5861             cbp |= 1 << i8x8;
5862         }
5863     }
5864     return cbp;
5865 }
5866 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5867     int ctx;
5868     int cbp_a, cbp_b;
5869
5870     cbp_a = (h->left_cbp>>4)&0x03;
5871     cbp_b = (h-> top_cbp>>4)&0x03;
5872
5873     ctx = 0;
5874     if( cbp_a > 0 ) ctx++;
5875     if( cbp_b > 0 ) ctx += 2;
5876     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5877         return 0;
5878
5879     ctx = 4;
5880     if( cbp_a == 2 ) ctx++;
5881     if( cbp_b == 2 ) ctx += 2;
5882     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5883 }
5884 static int decode_cabac_mb_dqp( H264Context *h) {
5885     MpegEncContext * const s = &h->s;
5886     int mbn_xy;
5887     int   ctx = 0;
5888     int   val = 0;
5889
5890     if( s->mb_x > 0 )
5891         mbn_xy = s->mb_x + s->mb_y*s->mb_stride - 1;
5892     else
5893         mbn_xy = s->mb_width - 1 + (s->mb_y-1)*s->mb_stride;
5894
5895     if( h->last_qscale_diff != 0 )
5896         ctx++;
5897
5898     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5899         if( ctx < 2 )
5900             ctx = 2;
5901         else
5902             ctx = 3;
5903         val++;
5904         if(val > 102) //prevent infinite loop
5905             return INT_MIN;
5906     }
5907
5908     if( val&0x01 )
5909         return (val + 1)/2;
5910     else
5911         return -(val + 1)/2;
5912 }
5913 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5914     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5915         return 0;   /* 8x8 */
5916     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5917         return 1;   /* 8x4 */
5918     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5919         return 2;   /* 4x8 */
5920     return 3;       /* 4x4 */
5921 }
5922 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5923     int type;
5924     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5925         return 0;   /* B_Direct_8x8 */
5926     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5927         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5928     type = 3;
5929     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5930         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5931             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5932         type += 4;
5933     }
5934     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5935     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5936     return type;
5937 }
5938
5939 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5940     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5941 }
5942
5943 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5944     int refa = h->ref_cache[list][scan8[n] - 1];
5945     int refb = h->ref_cache[list][scan8[n] - 8];
5946     int ref  = 0;
5947     int ctx  = 0;
5948
5949     if( h->slice_type == B_TYPE) {
5950         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5951             ctx++;
5952         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5953             ctx += 2;
5954     } else {
5955         if( refa > 0 )
5956             ctx++;
5957         if( refb > 0 )
5958             ctx += 2;
5959     }
5960
5961     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5962         ref++;
5963         if( ctx < 4 )
5964             ctx = 4;
5965         else
5966             ctx = 5;
5967     }
5968     return ref;
5969 }
5970
5971 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5972     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5973                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5974     int ctxbase = (l == 0) ? 40 : 47;
5975     int ctx, mvd;
5976
5977     if( amvd < 3 )
5978         ctx = 0;
5979     else if( amvd > 32 )
5980         ctx = 2;
5981     else
5982         ctx = 1;
5983
5984     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5985         return 0;
5986
5987     mvd= 1;
5988     ctx= 3;
5989     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5990         mvd++;
5991         if( ctx < 6 )
5992             ctx++;
5993     }
5994
5995     if( mvd >= 9 ) {
5996         int k = 3;
5997         while( get_cabac_bypass( &h->cabac ) ) {
5998             mvd += 1 << k;
5999             k++;
6000         }
6001         while( k-- ) {
6002             if( get_cabac_bypass( &h->cabac ) )
6003                 mvd += 1 << k;
6004         }
6005     }
6006     if( get_cabac_bypass( &h->cabac ) )  return -mvd;
6007     else                                 return  mvd;
6008 }
6009
6010 static int inline get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
6011     int nza, nzb;
6012     int ctx = 0;
6013
6014     if( cat == 0 ) {
6015         nza = h->left_cbp&0x100;
6016         nzb = h-> top_cbp&0x100;
6017     } else if( cat == 1 || cat == 2 ) {
6018         nza = h->non_zero_count_cache[scan8[idx] - 1];
6019         nzb = h->non_zero_count_cache[scan8[idx] - 8];
6020     } else if( cat == 3 ) {
6021         nza = (h->left_cbp>>(6+idx))&0x01;
6022         nzb = (h-> top_cbp>>(6+idx))&0x01;
6023     } else {
6024         assert(cat == 4);
6025         nza = h->non_zero_count_cache[scan8[16+idx] - 1];
6026         nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
6027     }
6028
6029     if( nza > 0 )
6030         ctx++;
6031
6032     if( nzb > 0 )
6033         ctx += 2;
6034
6035     return ctx + 4 * cat;
6036 }
6037
6038 static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff) {
6039     const int mb_xy  = h->s.mb_x + h->s.mb_y*h->s.mb_stride;
6040     static const int significant_coeff_flag_offset[2][6] = {
6041       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
6042       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
6043     };
6044     static const int last_coeff_flag_offset[2][6] = {
6045       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
6046       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
6047     };
6048     static const int coeff_abs_level_m1_offset[6] = {
6049         227+0, 227+10, 227+20, 227+30, 227+39, 426
6050     };
6051     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
6052       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
6053         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
6054         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
6055        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
6056       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
6057         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
6058         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
6059         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
6060     };
6061     static const uint8_t last_coeff_flag_offset_8x8[63] = {
6062         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6063         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
6064         3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
6065         5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
6066     };
6067
6068     int index[64];
6069
6070     int i, last;
6071     int coeff_count = 0;
6072
6073     int abslevel1 = 1;
6074     int abslevelgt1 = 0;
6075
6076     uint8_t *significant_coeff_ctx_base;
6077     uint8_t *last_coeff_ctx_base;
6078     uint8_t *abs_level_m1_ctx_base;
6079
6080     /* cat: 0-> DC 16x16  n = 0
6081      *      1-> AC 16x16  n = luma4x4idx
6082      *      2-> Luma4x4   n = luma4x4idx
6083      *      3-> DC Chroma n = iCbCr
6084      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
6085      *      5-> Luma8x8   n = 4 * luma8x8idx
6086      */
6087
6088     /* read coded block flag */
6089     if( cat != 5 ) {
6090         if( get_cabac( &h->cabac, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n ) ] ) == 0 ) {
6091             if( cat == 1 || cat == 2 )
6092                 h->non_zero_count_cache[scan8[n]] = 0;
6093             else if( cat == 4 )
6094                 h->non_zero_count_cache[scan8[16+n]] = 0;
6095
6096             return 0;
6097         }
6098     }
6099
6100     significant_coeff_ctx_base = h->cabac_state
6101         + significant_coeff_flag_offset[MB_FIELD][cat];
6102     last_coeff_ctx_base = h->cabac_state
6103         + last_coeff_flag_offset[MB_FIELD][cat];
6104     abs_level_m1_ctx_base = h->cabac_state
6105         + coeff_abs_level_m1_offset[cat];
6106
6107     if( cat == 5 ) {
6108 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
6109         for(last= 0; last < coefs; last++) { \
6110             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
6111             if( get_cabac( &h->cabac, sig_ctx )) { \
6112                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
6113                 index[coeff_count++] = last; \
6114                 if( get_cabac( &h->cabac, last_ctx ) ) { \
6115                     last= max_coeff; \
6116                     break; \
6117                 } \
6118             } \
6119         }
6120         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
6121         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
6122     } else {
6123         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
6124     }
6125     if( last == max_coeff -1 ) {
6126         index[coeff_count++] = last;
6127     }
6128     assert(coeff_count > 0);
6129
6130     if( cat == 0 )
6131         h->cbp_table[mb_xy] |= 0x100;
6132     else if( cat == 1 || cat == 2 )
6133         h->non_zero_count_cache[scan8[n]] = coeff_count;
6134     else if( cat == 3 )
6135         h->cbp_table[mb_xy] |= 0x40 << n;
6136     else if( cat == 4 )
6137         h->non_zero_count_cache[scan8[16+n]] = coeff_count;
6138     else {
6139         assert( cat == 5 );
6140         fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
6141     }
6142
6143     for( i = coeff_count - 1; i >= 0; i-- ) {
6144         uint8_t *ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 )) + abs_level_m1_ctx_base;
6145         int j= scantable[index[i]];
6146
6147         if( get_cabac( &h->cabac, ctx ) == 0 ) {
6148             if( !qmul ) {
6149                 if( get_cabac_bypass( &h->cabac ) ) block[j] = -1;
6150                 else                                block[j] =  1;
6151             }else{
6152                 if( get_cabac_bypass( &h->cabac ) ) block[j] = (-qmul[j] + 32) >> 6;
6153                 else                                block[j] = ( qmul[j] + 32) >> 6;
6154             }
6155
6156             abslevel1++;
6157         } else {
6158             int coeff_abs = 2;
6159             ctx = 5 + FFMIN( 4, abslevelgt1 ) + abs_level_m1_ctx_base;
6160             while( coeff_abs < 15 && get_cabac( &h->cabac, ctx ) ) {
6161                 coeff_abs++;
6162             }
6163
6164             if( coeff_abs >= 15 ) {
6165                 int j = 0;
6166                 while( get_cabac_bypass( &h->cabac ) ) {
6167                     coeff_abs += 1 << j;
6168                     j++;
6169                 }
6170
6171                 while( j-- ) {
6172                     if( get_cabac_bypass( &h->cabac ) )
6173                         coeff_abs += 1 << j ;
6174                 }
6175             }
6176
6177             if( !qmul ) {
6178                 if( get_cabac_bypass( &h->cabac ) ) block[j] = -coeff_abs;
6179                 else                                block[j] =  coeff_abs;
6180             }else{
6181                 if( get_cabac_bypass( &h->cabac ) ) block[j] = (-coeff_abs * qmul[j] + 32) >> 6;
6182                 else                                block[j] = ( coeff_abs * qmul[j] + 32) >> 6;
6183             }
6184
6185             abslevelgt1++;
6186         }
6187     }
6188     return 0;
6189 }
6190
6191 static void inline compute_mb_neighbors(H264Context *h)
6192 {
6193     MpegEncContext * const s = &h->s;
6194     const int mb_xy  = s->mb_x + s->mb_y*s->mb_stride;
6195     h->top_mb_xy     = mb_xy - s->mb_stride;
6196     h->left_mb_xy[0] = mb_xy - 1;
6197     if(FRAME_MBAFF){
6198         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
6199         const int top_pair_xy      = pair_xy     - s->mb_stride;
6200         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
6201         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
6202         const int curr_mb_frame_flag = !MB_FIELD;
6203         const int bottom = (s->mb_y & 1);
6204         if (bottom
6205                 ? !curr_mb_frame_flag // bottom macroblock
6206                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
6207                 ) {
6208             h->top_mb_xy -= s->mb_stride;
6209         }
6210         if (left_mb_frame_flag != curr_mb_frame_flag) {
6211             h->left_mb_xy[0] = pair_xy - 1;
6212         }
6213     }
6214     return;
6215 }
6216
6217 /**
6218  * decodes a macroblock
6219  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
6220  */
6221 static int decode_mb_cabac(H264Context *h) {
6222     MpegEncContext * const s = &h->s;
6223     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
6224     int mb_type, partition_count, cbp = 0;
6225     int dct8x8_allowed= h->pps.transform_8x8_mode;
6226
6227     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
6228
6229     tprintf("pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
6230     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE ) {
6231         int skip;
6232         /* a skipped mb needs the aff flag from the following mb */
6233         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
6234             predict_field_decoding_flag(h);
6235         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
6236             skip = h->next_mb_skipped;
6237         else
6238             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
6239         /* read skip flags */
6240         if( skip ) {
6241             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
6242                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
6243                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
6244                 if(h->next_mb_skipped)
6245                     predict_field_decoding_flag(h);
6246                 else
6247                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
6248             }
6249
6250             decode_mb_skip(h);
6251
6252             h->cbp_table[mb_xy] = 0;
6253             h->chroma_pred_mode_table[mb_xy] = 0;
6254             h->last_qscale_diff = 0;
6255
6256             return 0;
6257
6258         }
6259     }
6260     if(FRAME_MBAFF){
6261         if( (s->mb_y&1) == 0 )
6262             h->mb_mbaff =
6263             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
6264     }else
6265         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
6266
6267     h->prev_mb_skipped = 0;
6268
6269     compute_mb_neighbors(h);
6270     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
6271         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
6272         return -1;
6273     }
6274
6275     if( h->slice_type == B_TYPE ) {
6276         if( mb_type < 23 ){
6277             partition_count= b_mb_type_info[mb_type].partition_count;
6278             mb_type=         b_mb_type_info[mb_type].type;
6279         }else{
6280             mb_type -= 23;
6281             goto decode_intra_mb;
6282         }
6283     } else if( h->slice_type == P_TYPE ) {
6284         if( mb_type < 5) {
6285             partition_count= p_mb_type_info[mb_type].partition_count;
6286             mb_type=         p_mb_type_info[mb_type].type;
6287         } else {
6288             mb_type -= 5;
6289             goto decode_intra_mb;
6290         }
6291     } else {
6292        assert(h->slice_type == I_TYPE);
6293 decode_intra_mb:
6294         partition_count = 0;
6295         cbp= i_mb_type_info[mb_type].cbp;
6296         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
6297         mb_type= i_mb_type_info[mb_type].type;
6298     }
6299     if(MB_FIELD)
6300         mb_type |= MB_TYPE_INTERLACED;
6301
6302     h->slice_table[ mb_xy ]= h->slice_num;
6303
6304     if(IS_INTRA_PCM(mb_type)) {
6305         const uint8_t *ptr;
6306         unsigned int x, y;
6307
6308         // We assume these blocks are very rare so we dont optimize it.
6309         // FIXME The two following lines get the bitstream position in the cabac
6310         // decode, I think it should be done by a function in cabac.h (or cabac.c).
6311         ptr= h->cabac.bytestream;
6312         if (h->cabac.low&0x1) ptr-=CABAC_BITS/8;
6313
6314         // The pixels are stored in the same order as levels in h->mb array.
6315         for(y=0; y<16; y++){
6316             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
6317             for(x=0; x<16; x++){
6318                 tprintf("LUMA ICPM LEVEL (%3d)\n", *ptr);
6319                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
6320             }
6321         }
6322         for(y=0; y<8; y++){
6323             const int index= 256 + 4*(y&3) + 32*(y>>2);
6324             for(x=0; x<8; x++){
6325                 tprintf("CHROMA U ICPM LEVEL (%3d)\n", *ptr);
6326                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
6327             }
6328         }
6329         for(y=0; y<8; y++){
6330             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
6331             for(x=0; x<8; x++){
6332                 tprintf("CHROMA V ICPM LEVEL (%3d)\n", *ptr);
6333                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
6334             }
6335         }
6336
6337         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
6338
6339         // All blocks are present
6340         h->cbp_table[mb_xy] = 0x1ef;
6341         h->chroma_pred_mode_table[mb_xy] = 0;
6342         // In deblocking, the quantizer is 0
6343         s->current_picture.qscale_table[mb_xy]= 0;
6344         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0);
6345         // All coeffs are present
6346         memset(h->non_zero_count[mb_xy], 16, 16);
6347         s->current_picture.mb_type[mb_xy]= mb_type;
6348         return 0;
6349     }
6350
6351     if(MB_MBAFF){
6352         h->ref_count[0] <<= 1;
6353         h->ref_count[1] <<= 1;
6354     }
6355
6356     fill_caches(h, mb_type, 0);
6357
6358     if( IS_INTRA( mb_type ) ) {
6359         int i;
6360         if( IS_INTRA4x4( mb_type ) ) {
6361             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
6362                 mb_type |= MB_TYPE_8x8DCT;
6363                 for( i = 0; i < 16; i+=4 ) {
6364                     int pred = pred_intra_mode( h, i );
6365                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
6366                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
6367                 }
6368             } else {
6369                 for( i = 0; i < 16; i++ ) {
6370                     int pred = pred_intra_mode( h, i );
6371                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
6372
6373                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
6374                 }
6375             }
6376             write_back_intra_pred_mode(h);
6377             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
6378         } else {
6379             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
6380             if( h->intra16x16_pred_mode < 0 ) return -1;
6381         }
6382         h->chroma_pred_mode_table[mb_xy] =
6383             h->chroma_pred_mode          = decode_cabac_mb_chroma_pre_mode( h );
6384
6385         h->chroma_pred_mode= check_intra_pred_mode( h, h->chroma_pred_mode );
6386         if( h->chroma_pred_mode < 0 ) return -1;
6387     } else if( partition_count == 4 ) {
6388         int i, j, sub_partition_count[4], list, ref[2][4];
6389
6390         if( h->slice_type == B_TYPE ) {
6391             for( i = 0; i < 4; i++ ) {
6392                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
6393                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
6394                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
6395             }
6396             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
6397                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
6398                 pred_direct_motion(h, &mb_type);
6399                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
6400                     for( i = 0; i < 4; i++ )
6401                         if( IS_DIRECT(h->sub_mb_type[i]) )
6402                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
6403                 }
6404             }
6405         } else {
6406             for( i = 0; i < 4; i++ ) {
6407                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
6408                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
6409                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
6410             }
6411         }
6412
6413         for( list = 0; list < 2; list++ ) {
6414             if( h->ref_count[list] > 0 ) {
6415                 for( i = 0; i < 4; i++ ) {
6416                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
6417                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
6418                         if( h->ref_count[list] > 1 )
6419                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
6420                         else
6421                             ref[list][i] = 0;
6422                     } else {
6423                         ref[list][i] = -1;
6424                     }
6425                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
6426                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
6427                 }
6428             }
6429         }
6430
6431         if(dct8x8_allowed)
6432             dct8x8_allowed = get_dct8x8_allowed(h);
6433
6434         for(list=0; list<2; list++){
6435             for(i=0; i<4; i++){
6436                 if(IS_DIRECT(h->sub_mb_type[i])){
6437                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
6438                     continue;
6439                 }
6440                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
6441
6442                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
6443                     const int sub_mb_type= h->sub_mb_type[i];
6444                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
6445                     for(j=0; j<sub_partition_count[i]; j++){
6446                         int mpx, mpy;
6447                         int mx, my;
6448                         const int index= 4*i + block_width*j;
6449                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
6450                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
6451                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
6452
6453                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
6454                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
6455                         tprintf("final mv:%d %d\n", mx, my);
6456
6457                         if(IS_SUB_8X8(sub_mb_type)){
6458                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]=
6459                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
6460                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]=
6461                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
6462
6463                             mvd_cache[ 0 ][0]= mvd_cache[ 1 ][0]=
6464                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
6465                             mvd_cache[ 0 ][1]= mvd_cache[ 1 ][1]=
6466                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
6467                         }else if(IS_SUB_8X4(sub_mb_type)){
6468                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]= mx;
6469                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]= my;
6470
6471                             mvd_cache[ 0 ][0]= mvd_cache[ 1 ][0]= mx- mpx;
6472                             mvd_cache[ 0 ][1]= mvd_cache[ 1 ][1]= my - mpy;
6473                         }else if(IS_SUB_4X8(sub_mb_type)){
6474                             mv_cache[ 0 ][0]= mv_cache[ 8 ][0]= mx;
6475                             mv_cache[ 0 ][1]= mv_cache[ 8 ][1]= my;
6476
6477                             mvd_cache[ 0 ][0]= mvd_cache[ 8 ][0]= mx - mpx;
6478                             mvd_cache[ 0 ][1]= mvd_cache[ 8 ][1]= my - mpy;
6479                         }else{
6480                             assert(IS_SUB_4X4(sub_mb_type));
6481                             mv_cache[ 0 ][0]= mx;
6482                             mv_cache[ 0 ][1]= my;
6483
6484                             mvd_cache[ 0 ][0]= mx - mpx;
6485                             mvd_cache[ 0 ][1]= my - mpy;
6486                         }
6487                     }
6488                 }else{
6489                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
6490                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
6491                     p[0] = p[1] = p[8] = p[9] = 0;
6492                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
6493                 }
6494             }
6495         }
6496     } else if( IS_DIRECT(mb_type) ) {
6497         pred_direct_motion(h, &mb_type);
6498         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
6499         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
6500         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
6501     } else {
6502         int list, mx, my, i, mpx, mpy;
6503         if(IS_16X16(mb_type)){
6504             for(list=0; list<2; list++){
6505                 if(IS_DIR(mb_type, 0, list)){
6506                     if(h->ref_count[list] > 0 ){
6507                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
6508                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
6509                     }
6510                 }else
6511                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
6512             }
6513             for(list=0; list<2; list++){
6514                 if(IS_DIR(mb_type, 0, list)){
6515                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
6516
6517                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
6518                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
6519                     tprintf("final mv:%d %d\n", mx, my);
6520
6521                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
6522                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
6523                 }else
6524                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
6525             }
6526         }
6527         else if(IS_16X8(mb_type)){
6528             for(list=0; list<2; list++){
6529                 if(h->ref_count[list]>0){
6530                     for(i=0; i<2; i++){
6531                         if(IS_DIR(mb_type, i, list)){
6532                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
6533                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
6534                         }else
6535                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
6536                     }
6537                 }
6538             }
6539             for(list=0; list<2; list++){
6540                 for(i=0; i<2; i++){
6541                     if(IS_DIR(mb_type, i, list)){
6542                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
6543                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
6544                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
6545                         tprintf("final mv:%d %d\n", mx, my);
6546
6547                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
6548                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
6549                     }else{
6550                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
6551                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
6552                     }
6553                 }
6554             }
6555         }else{
6556             assert(IS_8X16(mb_type));
6557             for(list=0; list<2; list++){
6558                 if(h->ref_count[list]>0){
6559                     for(i=0; i<2; i++){
6560                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
6561                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
6562                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
6563                         }else
6564                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
6565                     }
6566                 }
6567             }
6568             for(list=0; list<2; list++){
6569                 for(i=0; i<2; i++){
6570                     if(IS_DIR(mb_type, i, list)){
6571                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
6572                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
6573                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
6574
6575                         tprintf("final mv:%d %d\n", mx, my);
6576                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
6577                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
6578                     }else{
6579                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
6580                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
6581                     }
6582                 }
6583             }
6584         }
6585     }
6586
6587    if( IS_INTER( mb_type ) ) {
6588         h->chroma_pred_mode_table[mb_xy] = 0;
6589         write_back_motion( h, mb_type );
6590    }
6591
6592     if( !IS_INTRA16x16( mb_type ) ) {
6593         cbp  = decode_cabac_mb_cbp_luma( h );
6594         cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
6595     }
6596
6597     h->cbp_table[mb_xy] = h->cbp = cbp;
6598
6599     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
6600         if( decode_cabac_mb_transform_size( h ) )
6601             mb_type |= MB_TYPE_8x8DCT;
6602     }
6603     s->current_picture.mb_type[mb_xy]= mb_type;
6604
6605     if( cbp || IS_INTRA16x16( mb_type ) ) {
6606         const uint8_t *scan, *scan8x8, *dc_scan;
6607         int dqp;
6608
6609         if(IS_INTERLACED(mb_type)){
6610             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
6611             scan= s->qscale ? h->field_scan : h->field_scan_q0;
6612             dc_scan= luma_dc_field_scan;
6613         }else{
6614             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
6615             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
6616             dc_scan= luma_dc_zigzag_scan;
6617         }
6618
6619         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
6620         if( dqp == INT_MIN ){
6621             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
6622             return -1;
6623         }
6624         s->qscale += dqp;
6625         if(((unsigned)s->qscale) > 51){
6626             if(s->qscale<0) s->qscale+= 52;
6627             else            s->qscale-= 52;
6628         }
6629         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
6630
6631         if( IS_INTRA16x16( mb_type ) ) {
6632             int i;
6633             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
6634             if( decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16) < 0)
6635                 return -1;
6636             if( cbp&15 ) {
6637                 for( i = 0; i < 16; i++ ) {
6638                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
6639                     if( decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 )
6640                         return -1;
6641                 }
6642             } else {
6643                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
6644             }
6645         } else {
6646             int i8x8, i4x4;
6647             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
6648                 if( cbp & (1<<i8x8) ) {
6649                     if( IS_8x8DCT(mb_type) ) {
6650                         if( decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
6651                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64) < 0 )
6652                             return -1;
6653                     } else
6654                     for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
6655                         const int index = 4*i8x8 + i4x4;
6656                         //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
6657 //START_TIMER
6658                         if( decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) < 0 )
6659                             return -1;
6660 //STOP_TIMER("decode_residual")
6661                     }
6662                 } else {
6663                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
6664                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
6665                 }
6666             }
6667         }
6668
6669         if( cbp&0x30 ){
6670             int c;
6671             for( c = 0; c < 2; c++ ) {
6672                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
6673                 if( decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4) < 0)
6674                     return -1;
6675             }
6676         }
6677
6678         if( cbp&0x20 ) {
6679             int c, i;
6680             for( c = 0; c < 2; c++ ) {
6681                 for( i = 0; i < 4; i++ ) {
6682                     const int index = 16 + 4 * c + i;
6683                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
6684                     if( decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp], 15) < 0)
6685                         return -1;
6686                 }
6687             }
6688         } else {
6689             uint8_t * const nnz= &h->non_zero_count_cache[0];
6690             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6691             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6692         }
6693     } else {
6694         uint8_t * const nnz= &h->non_zero_count_cache[0];
6695         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
6696         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6697         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6698         h->last_qscale_diff = 0;
6699     }
6700
6701     s->current_picture.qscale_table[mb_xy]= s->qscale;
6702     write_back_non_zero_count(h);
6703
6704     if(MB_MBAFF){
6705         h->ref_count[0] >>= 1;
6706         h->ref_count[1] >>= 1;
6707     }
6708
6709     return 0;
6710 }
6711
6712
6713 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6714     int i, d;
6715     const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
6716     const int alpha = alpha_table[index_a];
6717     const int beta  = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
6718
6719     if( bS[0] < 4 ) {
6720         int8_t tc[4];
6721         for(i=0; i<4; i++)
6722             tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] : -1;
6723         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6724     } else {
6725         /* 16px edge length, because bS=4 is triggered by being at
6726          * the edge of an intra MB, so all 4 bS are the same */
6727             for( d = 0; d < 16; d++ ) {
6728                 const int p0 = pix[-1];
6729                 const int p1 = pix[-2];
6730                 const int p2 = pix[-3];
6731
6732                 const int q0 = pix[0];
6733                 const int q1 = pix[1];
6734                 const int q2 = pix[2];
6735
6736                 if( FFABS( p0 - q0 ) < alpha &&
6737                     FFABS( p1 - p0 ) < beta &&
6738                     FFABS( q1 - q0 ) < beta ) {
6739
6740                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6741                         if( FFABS( p2 - p0 ) < beta)
6742                         {
6743                             const int p3 = pix[-4];
6744                             /* p0', p1', p2' */
6745                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6746                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6747                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6748                         } else {
6749                             /* p0' */
6750                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6751                         }
6752                         if( FFABS( q2 - q0 ) < beta)
6753                         {
6754                             const int q3 = pix[3];
6755                             /* q0', q1', q2' */
6756                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6757                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6758                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6759                         } else {
6760                             /* q0' */
6761                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6762                         }
6763                     }else{
6764                         /* p0', q0' */
6765                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6766                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6767                     }
6768                     tprintf("filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
6769                 }
6770                 pix += stride;
6771             }
6772     }
6773 }
6774 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6775     int i;
6776     const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
6777     const int alpha = alpha_table[index_a];
6778     const int beta  = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
6779
6780     if( bS[0] < 4 ) {
6781         int8_t tc[4];
6782         for(i=0; i<4; i++)
6783             tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] + 1 : 0;
6784         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6785     } else {
6786         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6787     }
6788 }
6789
6790 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6791     int i;
6792     for( i = 0; i < 16; i++, pix += stride) {
6793         int index_a;
6794         int alpha;
6795         int beta;
6796
6797         int qp_index;
6798         int bS_index = (i >> 1);
6799         if (!MB_FIELD) {
6800             bS_index &= ~1;
6801             bS_index |= (i & 1);
6802         }
6803
6804         if( bS[bS_index] == 0 ) {
6805             continue;
6806         }
6807
6808         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6809         index_a = clip( qp[qp_index] + h->slice_alpha_c0_offset, 0, 51 );
6810         alpha = alpha_table[index_a];
6811         beta  = beta_table[clip( qp[qp_index] + h->slice_beta_offset, 0, 51 )];
6812
6813         if( bS[bS_index] < 4 ) {
6814             const int tc0 = tc0_table[index_a][bS[bS_index] - 1];
6815             const int p0 = pix[-1];
6816             const int p1 = pix[-2];
6817             const int p2 = pix[-3];
6818             const int q0 = pix[0];
6819             const int q1 = pix[1];
6820             const int q2 = pix[2];
6821
6822             if( FFABS( p0 - q0 ) < alpha &&
6823                 FFABS( p1 - p0 ) < beta &&
6824                 FFABS( q1 - q0 ) < beta ) {
6825                 int tc = tc0;
6826                 int i_delta;
6827
6828                 if( FFABS( p2 - p0 ) < beta ) {
6829                     pix[-2] = p1 + clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6830                     tc++;
6831                 }
6832                 if( FFABS( q2 - q0 ) < beta ) {
6833                     pix[1] = q1 + clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6834                     tc++;
6835                 }
6836
6837                 i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6838                 pix[-1] = clip_uint8( p0 + i_delta );    /* p0' */
6839                 pix[0]  = clip_uint8( q0 - i_delta );    /* q0' */
6840                 tprintf("filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6841             }
6842         }else{
6843             const int p0 = pix[-1];
6844             const int p1 = pix[-2];
6845             const int p2 = pix[-3];
6846
6847             const int q0 = pix[0];
6848             const int q1 = pix[1];
6849             const int q2 = pix[2];
6850
6851             if( FFABS( p0 - q0 ) < alpha &&
6852                 FFABS( p1 - p0 ) < beta &&
6853                 FFABS( q1 - q0 ) < beta ) {
6854
6855                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6856                     if( FFABS( p2 - p0 ) < beta)
6857                     {
6858                         const int p3 = pix[-4];
6859                         /* p0', p1', p2' */
6860                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6861                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6862                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6863                     } else {
6864                         /* p0' */
6865                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6866                     }
6867                     if( FFABS( q2 - q0 ) < beta)
6868                     {
6869                         const int q3 = pix[3];
6870                         /* q0', q1', q2' */
6871                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6872                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6873                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6874                     } else {
6875                         /* q0' */
6876                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6877                     }
6878                 }else{
6879                     /* p0', q0' */
6880                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6881                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6882                 }
6883                 tprintf("filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6884             }
6885         }
6886     }
6887 }
6888 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6889     int i;
6890     for( i = 0; i < 8; i++, pix += stride) {
6891         int index_a;
6892         int alpha;
6893         int beta;
6894
6895         int qp_index;
6896         int bS_index = i;
6897
6898         if( bS[bS_index] == 0 ) {
6899             continue;
6900         }
6901
6902         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6903         index_a = clip( qp[qp_index] + h->slice_alpha_c0_offset, 0, 51 );
6904         alpha = alpha_table[index_a];
6905         beta  = beta_table[clip( qp[qp_index] + h->slice_beta_offset, 0, 51 )];
6906
6907         if( bS[bS_index] < 4 ) {
6908             const int tc = tc0_table[index_a][bS[bS_index] - 1] + 1;
6909             const int p0 = pix[-1];
6910             const int p1 = pix[-2];
6911             const int q0 = pix[0];
6912             const int q1 = pix[1];
6913
6914             if( FFABS( p0 - q0 ) < alpha &&
6915                 FFABS( p1 - p0 ) < beta &&
6916                 FFABS( q1 - q0 ) < beta ) {
6917                 const int i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6918
6919                 pix[-1] = clip_uint8( p0 + i_delta );    /* p0' */
6920                 pix[0]  = clip_uint8( q0 - i_delta );    /* q0' */
6921                 tprintf("filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6922             }
6923         }else{
6924             const int p0 = pix[-1];
6925             const int p1 = pix[-2];
6926             const int q0 = pix[0];
6927             const int q1 = pix[1];
6928
6929             if( FFABS( p0 - q0 ) < alpha &&
6930                 FFABS( p1 - p0 ) < beta &&
6931                 FFABS( q1 - q0 ) < beta ) {
6932
6933                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6934                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6935                 tprintf("filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6936             }
6937         }
6938     }
6939 }
6940
6941 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6942     int i, d;
6943     const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
6944     const int alpha = alpha_table[index_a];
6945     const int beta  = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
6946     const int pix_next  = stride;
6947
6948     if( bS[0] < 4 ) {
6949         int8_t tc[4];
6950         for(i=0; i<4; i++)
6951             tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] : -1;
6952         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6953     } else {
6954         /* 16px edge length, see filter_mb_edgev */
6955             for( d = 0; d < 16; d++ ) {
6956                 const int p0 = pix[-1*pix_next];
6957                 const int p1 = pix[-2*pix_next];
6958                 const int p2 = pix[-3*pix_next];
6959                 const int q0 = pix[0];
6960                 const int q1 = pix[1*pix_next];
6961                 const int q2 = pix[2*pix_next];
6962
6963                 if( FFABS( p0 - q0 ) < alpha &&
6964                     FFABS( p1 - p0 ) < beta &&
6965                     FFABS( q1 - q0 ) < beta ) {
6966
6967                     const int p3 = pix[-4*pix_next];
6968                     const int q3 = pix[ 3*pix_next];
6969
6970                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6971                         if( FFABS( p2 - p0 ) < beta) {
6972                             /* p0', p1', p2' */
6973                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6974                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6975                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6976                         } else {
6977                             /* p0' */
6978                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6979                         }
6980                         if( FFABS( q2 - q0 ) < beta) {
6981                             /* q0', q1', q2' */
6982                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6983                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6984                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6985                         } else {
6986                             /* q0' */
6987                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6988                         }
6989                     }else{
6990                         /* p0', q0' */
6991                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6992                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6993                     }
6994                     tprintf("filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6995                 }
6996                 pix++;
6997             }
6998     }
6999 }
7000
7001 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
7002     int i;
7003     const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
7004     const int alpha = alpha_table[index_a];
7005     const int beta  = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
7006
7007     if( bS[0] < 4 ) {
7008         int8_t tc[4];
7009         for(i=0; i<4; i++)
7010             tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] + 1 : 0;
7011         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
7012     } else {
7013         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
7014     }
7015 }
7016
7017 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
7018     MpegEncContext * const s = &h->s;
7019     int mb_xy, mb_type;
7020     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
7021
7022     if(mb_x==0 || mb_y==0 || !s->dsp.h264_loop_filter_strength) {
7023         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
7024         return;
7025     }
7026     assert(!FRAME_MBAFF);
7027
7028     mb_xy = mb_x + mb_y*s->mb_stride;
7029     mb_type = s->current_picture.mb_type[mb_xy];
7030     qp = s->current_picture.qscale_table[mb_xy];
7031     qp0 = s->current_picture.qscale_table[mb_xy-1];
7032     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
7033     qpc = get_chroma_qp( h->pps.chroma_qp_index_offset, qp );
7034     qpc0 = get_chroma_qp( h->pps.chroma_qp_index_offset, qp0 );
7035     qpc1 = get_chroma_qp( h->pps.chroma_qp_index_offset, qp1 );
7036     qp0 = (qp + qp0 + 1) >> 1;
7037     qp1 = (qp + qp1 + 1) >> 1;
7038     qpc0 = (qpc + qpc0 + 1) >> 1;
7039     qpc1 = (qpc + qpc1 + 1) >> 1;
7040     qp_thresh = 15 - h->slice_alpha_c0_offset;
7041     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
7042        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
7043         return;
7044
7045     if( IS_INTRA(mb_type) ) {
7046         int16_t bS4[4] = {4,4,4,4};
7047         int16_t bS3[4] = {3,3,3,3};
7048         if( IS_8x8DCT(mb_type) ) {
7049             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
7050             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
7051             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bS4, qp1 );
7052             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
7053         } else {
7054             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
7055             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
7056             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
7057             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
7058             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bS4, qp1 );
7059             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
7060             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
7061             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
7062         }
7063         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
7064         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
7065         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
7066         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
7067         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bS4, qpc1 );
7068         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
7069         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bS4, qpc1 );
7070         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
7071         return;
7072     } else {
7073         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
7074         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
7075         int edges;
7076         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
7077             edges = 4;
7078             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
7079         } else {
7080             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
7081                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
7082             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
7083                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
7084                              ? 3 : 0;
7085             int step = IS_8x8DCT(mb_type) ? 2 : 1;
7086             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
7087             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
7088                                               (h->slice_type == B_TYPE), edges, step, mask_edge0, mask_edge1 );
7089         }
7090         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
7091             bSv[0][0] = 0x0004000400040004ULL;
7092         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
7093             bSv[1][0] = 0x0004000400040004ULL;
7094
7095 #define FILTER(hv,dir,edge)\
7096         if(bSv[dir][edge]) {\
7097             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
7098             if(!(edge&1)) {\
7099                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
7100                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
7101             }\
7102         }
7103         if( edges == 1 ) {
7104             FILTER(v,0,0);
7105             FILTER(h,1,0);
7106         } else if( IS_8x8DCT(mb_type) ) {
7107             FILTER(v,0,0);
7108             FILTER(v,0,2);
7109             FILTER(h,1,0);
7110             FILTER(h,1,2);
7111         } else {
7112             FILTER(v,0,0);
7113             FILTER(v,0,1);
7114             FILTER(v,0,2);
7115             FILTER(v,0,3);
7116             FILTER(h,1,0);
7117             FILTER(h,1,1);
7118             FILTER(h,1,2);
7119             FILTER(h,1,3);
7120         }
7121 #undef FILTER
7122     }
7123 }
7124
7125 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
7126     MpegEncContext * const s = &h->s;
7127     const int mb_xy= mb_x + mb_y*s->mb_stride;
7128     const int mb_type = s->current_picture.mb_type[mb_xy];
7129     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
7130     int first_vertical_edge_done = 0;
7131     int dir;
7132     /* FIXME: A given frame may occupy more than one position in
7133      * the reference list. So ref2frm should be populated with
7134      * frame numbers, not indices. */
7135     static const int ref2frm[34] = {-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
7136                                     16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
7137
7138     //for sufficiently low qp, filtering wouldn't do anything
7139     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
7140     if(!FRAME_MBAFF){
7141         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX(0, h->pps.chroma_qp_index_offset);
7142         int qp = s->current_picture.qscale_table[mb_xy];
7143         if(qp <= qp_thresh
7144            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
7145            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
7146             return;
7147         }
7148     }
7149
7150     if (FRAME_MBAFF
7151             // left mb is in picture
7152             && h->slice_table[mb_xy-1] != 255
7153             // and current and left pair do not have the same interlaced type
7154             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
7155             // and left mb is in the same slice if deblocking_filter == 2
7156             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
7157         /* First vertical edge is different in MBAFF frames
7158          * There are 8 different bS to compute and 2 different Qp
7159          */
7160         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
7161         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
7162         int16_t bS[8];
7163         int qp[2];
7164         int chroma_qp[2];
7165         int mb_qp, mbn0_qp, mbn1_qp;
7166         int i;
7167         first_vertical_edge_done = 1;
7168
7169         if( IS_INTRA(mb_type) )
7170             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
7171         else {
7172             for( i = 0; i < 8; i++ ) {
7173                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
7174
7175                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
7176                     bS[i] = 4;
7177                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
7178                          /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
7179                          h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
7180                     bS[i] = 2;
7181                 else
7182                     bS[i] = 1;
7183             }
7184         }
7185
7186         mb_qp = s->current_picture.qscale_table[mb_xy];
7187         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
7188         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
7189         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
7190         chroma_qp[0] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, mb_qp ) +
7191                          get_chroma_qp( h->pps.chroma_qp_index_offset, mbn0_qp ) + 1 ) >> 1;
7192         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
7193         chroma_qp[1] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, mb_qp ) +
7194                          get_chroma_qp( h->pps.chroma_qp_index_offset, mbn1_qp ) + 1 ) >> 1;
7195
7196         /* Filter edge */
7197         tprintf("filter mb:%d/%d MBAFF, QPy:%d/%d, QPc:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], chroma_qp[0], chroma_qp[1], linesize, uvlinesize);
7198         { int i; for (i = 0; i < 8; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
7199         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
7200         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, chroma_qp );
7201         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, chroma_qp );
7202     }
7203     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
7204     for( dir = 0; dir < 2; dir++ )
7205     {
7206         int edge;
7207         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
7208         const int mbm_type = s->current_picture.mb_type[mbm_xy];
7209         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
7210
7211         const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
7212                                   == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
7213         // how often to recheck mv-based bS when iterating between edges
7214         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
7215                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
7216         // how often to recheck mv-based bS when iterating along each edge
7217         const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
7218
7219         if (first_vertical_edge_done) {
7220             start = 1;
7221             first_vertical_edge_done = 0;
7222         }
7223
7224         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
7225             start = 1;
7226
7227         if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
7228             && !IS_INTERLACED(mb_type)
7229             && IS_INTERLACED(mbm_type)
7230             ) {
7231             // This is a special case in the norm where the filtering must
7232             // be done twice (one each of the field) even if we are in a
7233             // frame macroblock.
7234             //
7235             static const int nnz_idx[4] = {4,5,6,3};
7236             unsigned int tmp_linesize   = 2 *   linesize;
7237             unsigned int tmp_uvlinesize = 2 * uvlinesize;
7238             int mbn_xy = mb_xy - 2 * s->mb_stride;
7239             int qp, chroma_qp;
7240             int i, j;
7241             int16_t bS[4];
7242
7243             for(j=0; j<2; j++, mbn_xy += s->mb_stride){
7244                 if( IS_INTRA(mb_type) ||
7245                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
7246                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
7247                 } else {
7248                     const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
7249                     for( i = 0; i < 4; i++ ) {
7250                         if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
7251                             mbn_nnz[nnz_idx[i]] != 0 )
7252                             bS[i] = 2;
7253                         else
7254                             bS[i] = 1;
7255                     }
7256                 }
7257                 // Do not use s->qscale as luma quantizer because it has not the same
7258                 // value in IPCM macroblocks.
7259                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
7260                 tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
7261                 { int i; for (i = 0; i < 4; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
7262                 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
7263                 chroma_qp = ( h->chroma_qp +
7264                               get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
7265                 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp );
7266                 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp );
7267             }
7268
7269             start = 1;
7270         }
7271
7272         /* Calculate bS */
7273         for( edge = start; edge < edges; edge++ ) {
7274             /* mbn_xy: neighbor macroblock */
7275             const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
7276             const int mbn_type = s->current_picture.mb_type[mbn_xy];
7277             int16_t bS[4];
7278             int qp;
7279
7280             if( (edge&1) && IS_8x8DCT(mb_type) )
7281                 continue;
7282
7283             if( IS_INTRA(mb_type) ||
7284                 IS_INTRA(mbn_type) ) {
7285                 int value;
7286                 if (edge == 0) {
7287                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
7288                         || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
7289                     ) {
7290                         value = 4;
7291                     } else {
7292                         value = 3;
7293                     }
7294                 } else {
7295                     value = 3;
7296                 }
7297                 bS[0] = bS[1] = bS[2] = bS[3] = value;
7298             } else {
7299                 int i, l;
7300                 int mv_done;
7301
7302                 if( edge & mask_edge ) {
7303                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
7304                     mv_done = 1;
7305                 }
7306                 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
7307                     bS[0] = bS[1] = bS[2] = bS[3] = 1;
7308                     mv_done = 1;
7309                 }
7310                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
7311                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
7312                     int bn_idx= b_idx - (dir ? 8:1);
7313                     int v = 0;
7314                     for( l = 0; !v && l < 1 + (h->slice_type == B_TYPE); l++ ) {
7315                         v |= ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
7316                              FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
7317                              FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
7318                     }
7319                     bS[0] = bS[1] = bS[2] = bS[3] = v;
7320                     mv_done = 1;
7321                 }
7322                 else
7323                     mv_done = 0;
7324
7325                 for( i = 0; i < 4; i++ ) {
7326                     int x = dir == 0 ? edge : i;
7327                     int y = dir == 0 ? i    : edge;
7328                     int b_idx= 8 + 4 + x + 8*y;
7329                     int bn_idx= b_idx - (dir ? 8:1);
7330
7331                     if( h->non_zero_count_cache[b_idx] != 0 ||
7332                         h->non_zero_count_cache[bn_idx] != 0 ) {
7333                         bS[i] = 2;
7334                     }
7335                     else if(!mv_done)
7336                     {
7337                         bS[i] = 0;
7338                         for( l = 0; l < 1 + (h->slice_type == B_TYPE); l++ ) {
7339                             if( ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
7340                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
7341                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
7342                                 bS[i] = 1;
7343                                 break;
7344                             }
7345                         }
7346                     }
7347                 }
7348
7349                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
7350                     continue;
7351             }
7352
7353             /* Filter edge */
7354             // Do not use s->qscale as luma quantizer because it has not the same
7355             // value in IPCM macroblocks.
7356             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
7357             //tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
7358             tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
7359             { int i; for (i = 0; i < 4; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
7360             if( dir == 0 ) {
7361                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
7362                 if( (edge&1) == 0 ) {
7363                     int chroma_qp = ( h->chroma_qp +
7364                                       get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
7365                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS, chroma_qp );
7366                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS, chroma_qp );
7367                 }
7368             } else {
7369                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
7370                 if( (edge&1) == 0 ) {
7371                     int chroma_qp = ( h->chroma_qp +
7372                                       get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
7373                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
7374                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
7375                 }
7376             }
7377         }
7378     }
7379 }
7380
7381 static int decode_slice(H264Context *h){
7382     MpegEncContext * const s = &h->s;
7383     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
7384
7385     s->mb_skip_run= -1;
7386
7387     if( h->pps.cabac ) {
7388         int i;
7389
7390         /* realign */
7391         align_get_bits( &s->gb );
7392
7393         /* init cabac */
7394         ff_init_cabac_states( &h->cabac);
7395         ff_init_cabac_decoder( &h->cabac,
7396                                s->gb.buffer + get_bits_count(&s->gb)/8,
7397                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
7398         /* calculate pre-state */
7399         for( i= 0; i < 460; i++ ) {
7400             int pre;
7401             if( h->slice_type == I_TYPE )
7402                 pre = clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
7403             else
7404                 pre = clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
7405
7406             if( pre <= 63 )
7407                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
7408             else
7409                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
7410         }
7411
7412         for(;;){
7413 //START_TIMER
7414             int ret = decode_mb_cabac(h);
7415             int eos;
7416 //STOP_TIMER("decode_mb_cabac")
7417
7418             if(ret>=0) hl_decode_mb(h);
7419
7420             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
7421                 s->mb_y++;
7422
7423                 if(ret>=0) ret = decode_mb_cabac(h);
7424
7425                 if(ret>=0) hl_decode_mb(h);
7426                 s->mb_y--;
7427             }
7428             eos = get_cabac_terminate( &h->cabac );
7429
7430             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
7431                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%d)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
7432                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7433                 return -1;
7434             }
7435
7436             if( ++s->mb_x >= s->mb_width ) {
7437                 s->mb_x = 0;
7438                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
7439                 ++s->mb_y;
7440                 if(FRAME_MBAFF) {
7441                     ++s->mb_y;
7442                 }
7443             }
7444
7445             if( eos || s->mb_y >= s->mb_height ) {
7446                 tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7447                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7448                 return 0;
7449             }
7450         }
7451
7452     } else {
7453         for(;;){
7454             int ret = decode_mb_cavlc(h);
7455
7456             if(ret>=0) hl_decode_mb(h);
7457
7458             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
7459                 s->mb_y++;
7460                 ret = decode_mb_cavlc(h);
7461
7462                 if(ret>=0) hl_decode_mb(h);
7463                 s->mb_y--;
7464             }
7465
7466             if(ret<0){
7467                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
7468                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7469
7470                 return -1;
7471             }
7472
7473             if(++s->mb_x >= s->mb_width){
7474                 s->mb_x=0;
7475                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
7476                 ++s->mb_y;
7477                 if(FRAME_MBAFF) {
7478                     ++s->mb_y;
7479                 }
7480                 if(s->mb_y >= s->mb_height){
7481                     tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7482
7483                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
7484                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7485
7486                         return 0;
7487                     }else{
7488                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7489
7490                         return -1;
7491                     }
7492                 }
7493             }
7494
7495             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
7496                 tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7497                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
7498                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7499
7500                     return 0;
7501                 }else{
7502                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7503
7504                     return -1;
7505                 }
7506             }
7507         }
7508     }
7509
7510 #if 0
7511     for(;s->mb_y < s->mb_height; s->mb_y++){
7512         for(;s->mb_x < s->mb_width; s->mb_x++){
7513             int ret= decode_mb(h);
7514
7515             hl_decode_mb(h);
7516
7517             if(ret<0){
7518                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
7519                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7520
7521                 return -1;
7522             }
7523
7524             if(++s->mb_x >= s->mb_width){
7525                 s->mb_x=0;
7526                 if(++s->mb_y >= s->mb_height){
7527                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
7528                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7529
7530                         return 0;
7531                     }else{
7532                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7533
7534                         return -1;
7535                     }
7536                 }
7537             }
7538
7539             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
7540                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
7541                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7542
7543                     return 0;
7544                 }else{
7545                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7546
7547                     return -1;
7548                 }
7549             }
7550         }
7551         s->mb_x=0;
7552         ff_draw_horiz_band(s, 16*s->mb_y, 16);
7553     }
7554 #endif
7555     return -1; //not reached
7556 }
7557
7558 static int decode_unregistered_user_data(H264Context *h, int size){
7559     MpegEncContext * const s = &h->s;
7560     uint8_t user_data[16+256];
7561     int e, build, i;
7562
7563     if(size<16)
7564         return -1;
7565
7566     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
7567         user_data[i]= get_bits(&s->gb, 8);
7568     }
7569
7570     user_data[i]= 0;
7571     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
7572     if(e==1 && build>=0)
7573         h->x264_build= build;
7574
7575     if(s->avctx->debug & FF_DEBUG_BUGS)
7576         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
7577
7578     for(; i<size; i++)
7579         skip_bits(&s->gb, 8);
7580
7581     return 0;
7582 }
7583
7584 static int decode_sei(H264Context *h){
7585     MpegEncContext * const s = &h->s;
7586
7587     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
7588         int size, type;
7589
7590         type=0;
7591         do{
7592             type+= show_bits(&s->gb, 8);
7593         }while(get_bits(&s->gb, 8) == 255);
7594
7595         size=0;
7596         do{
7597             size+= show_bits(&s->gb, 8);
7598         }while(get_bits(&s->gb, 8) == 255);
7599
7600         switch(type){
7601         case 5:
7602             if(decode_unregistered_user_data(h, size) < 0)
7603                 return -1;
7604             break;
7605         default:
7606             skip_bits(&s->gb, 8*size);
7607         }
7608
7609         //FIXME check bits here
7610         align_get_bits(&s->gb);
7611     }
7612
7613     return 0;
7614 }
7615
7616 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
7617     MpegEncContext * const s = &h->s;
7618     int cpb_count, i;
7619     cpb_count = get_ue_golomb(&s->gb) + 1;
7620     get_bits(&s->gb, 4); /* bit_rate_scale */
7621     get_bits(&s->gb, 4); /* cpb_size_scale */
7622     for(i=0; i<cpb_count; i++){
7623         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
7624         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
7625         get_bits1(&s->gb);     /* cbr_flag */
7626     }
7627     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
7628     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
7629     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
7630     get_bits(&s->gb, 5); /* time_offset_length */
7631 }
7632
7633 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
7634     MpegEncContext * const s = &h->s;
7635     int aspect_ratio_info_present_flag, aspect_ratio_idc;
7636     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
7637
7638     aspect_ratio_info_present_flag= get_bits1(&s->gb);
7639
7640     if( aspect_ratio_info_present_flag ) {
7641         aspect_ratio_idc= get_bits(&s->gb, 8);
7642         if( aspect_ratio_idc == EXTENDED_SAR ) {
7643             sps->sar.num= get_bits(&s->gb, 16);
7644             sps->sar.den= get_bits(&s->gb, 16);
7645         }else if(aspect_ratio_idc < 14){
7646             sps->sar=  pixel_aspect[aspect_ratio_idc];
7647         }else{
7648             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
7649             return -1;
7650         }
7651     }else{
7652         sps->sar.num=
7653         sps->sar.den= 0;
7654     }
7655 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
7656
7657     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
7658         get_bits1(&s->gb);      /* overscan_appropriate_flag */
7659     }
7660
7661     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
7662         get_bits(&s->gb, 3);    /* video_format */
7663         get_bits1(&s->gb);      /* video_full_range_flag */
7664         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
7665             get_bits(&s->gb, 8); /* colour_primaries */
7666             get_bits(&s->gb, 8); /* transfer_characteristics */
7667             get_bits(&s->gb, 8); /* matrix_coefficients */
7668         }
7669     }
7670
7671     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
7672         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
7673         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
7674     }
7675
7676     sps->timing_info_present_flag = get_bits1(&s->gb);
7677     if(sps->timing_info_present_flag){
7678         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7679         sps->time_scale = get_bits_long(&s->gb, 32);
7680         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7681     }
7682
7683     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7684     if(nal_hrd_parameters_present_flag)
7685         decode_hrd_parameters(h, sps);
7686     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7687     if(vcl_hrd_parameters_present_flag)
7688         decode_hrd_parameters(h, sps);
7689     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
7690         get_bits1(&s->gb);     /* low_delay_hrd_flag */
7691     get_bits1(&s->gb);         /* pic_struct_present_flag */
7692
7693     sps->bitstream_restriction_flag = get_bits1(&s->gb);
7694     if(sps->bitstream_restriction_flag){
7695         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
7696         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7697         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7698         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7699         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7700         sps->num_reorder_frames = get_ue_golomb(&s->gb);
7701         get_ue_golomb(&s->gb); /* max_dec_frame_buffering */
7702     }
7703
7704     return 0;
7705 }
7706
7707 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7708                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7709     MpegEncContext * const s = &h->s;
7710     int i, last = 8, next = 8;
7711     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
7712     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7713         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7714     else
7715     for(i=0;i<size;i++){
7716         if(next)
7717             next = (last + get_se_golomb(&s->gb)) & 0xff;
7718         if(!i && !next){ /* matrix not written, we use the preset one */
7719             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7720             break;
7721         }
7722         last = factors[scan[i]] = next ? next : last;
7723     }
7724 }
7725
7726 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7727                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7728     MpegEncContext * const s = &h->s;
7729     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7730     const uint8_t *fallback[4] = {
7731         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7732         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7733         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7734         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7735     };
7736     if(get_bits1(&s->gb)){
7737         sps->scaling_matrix_present |= is_sps;
7738         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7739         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7740         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7741         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7742         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7743         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7744         if(is_sps || pps->transform_8x8_mode){
7745             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7746             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7747         }
7748     } else if(fallback_sps) {
7749         memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
7750         memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
7751     }
7752 }
7753
7754 static inline int decode_seq_parameter_set(H264Context *h){
7755     MpegEncContext * const s = &h->s;
7756     int profile_idc, level_idc;
7757     int sps_id, i;
7758     SPS *sps;
7759
7760     profile_idc= get_bits(&s->gb, 8);
7761     get_bits1(&s->gb);   //constraint_set0_flag
7762     get_bits1(&s->gb);   //constraint_set1_flag
7763     get_bits1(&s->gb);   //constraint_set2_flag
7764     get_bits1(&s->gb);   //constraint_set3_flag
7765     get_bits(&s->gb, 4); // reserved
7766     level_idc= get_bits(&s->gb, 8);
7767     sps_id= get_ue_golomb(&s->gb);
7768
7769     sps= &h->sps_buffer[ sps_id ];
7770     sps->profile_idc= profile_idc;
7771     sps->level_idc= level_idc;
7772
7773     if(sps->profile_idc >= 100){ //high profile
7774         if(get_ue_golomb(&s->gb) == 3) //chroma_format_idc
7775             get_bits1(&s->gb);  //residual_color_transform_flag
7776         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7777         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7778         sps->transform_bypass = get_bits1(&s->gb);
7779         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7780     }else
7781         sps->scaling_matrix_present = 0;
7782
7783     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7784     sps->poc_type= get_ue_golomb(&s->gb);
7785
7786     if(sps->poc_type == 0){ //FIXME #define
7787         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7788     } else if(sps->poc_type == 1){//FIXME #define
7789         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7790         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7791         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7792         sps->poc_cycle_length= get_ue_golomb(&s->gb);
7793
7794         for(i=0; i<sps->poc_cycle_length; i++)
7795             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7796     }
7797     if(sps->poc_type > 2){
7798         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7799         return -1;
7800     }
7801
7802     sps->ref_frame_count= get_ue_golomb(&s->gb);
7803     if(sps->ref_frame_count > MAX_PICTURE_COUNT-2){
7804         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7805     }
7806     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7807     sps->mb_width= get_ue_golomb(&s->gb) + 1;
7808     sps->mb_height= get_ue_golomb(&s->gb) + 1;
7809     if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7810        avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height))
7811         return -1;
7812
7813     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7814     if(!sps->frame_mbs_only_flag)
7815         sps->mb_aff= get_bits1(&s->gb);
7816     else
7817         sps->mb_aff= 0;
7818
7819     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7820
7821 #ifndef ALLOW_INTERLACE
7822     if(sps->mb_aff)
7823         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7824 #endif
7825     if(!sps->direct_8x8_inference_flag && sps->mb_aff)
7826         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
7827
7828     sps->crop= get_bits1(&s->gb);
7829     if(sps->crop){
7830         sps->crop_left  = get_ue_golomb(&s->gb);
7831         sps->crop_right = get_ue_golomb(&s->gb);
7832         sps->crop_top   = get_ue_golomb(&s->gb);
7833         sps->crop_bottom= get_ue_golomb(&s->gb);
7834         if(sps->crop_left || sps->crop_top){
7835             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7836         }
7837     }else{
7838         sps->crop_left  =
7839         sps->crop_right =
7840         sps->crop_top   =
7841         sps->crop_bottom= 0;
7842     }
7843
7844     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7845     if( sps->vui_parameters_present_flag )
7846         decode_vui_parameters(h, sps);
7847
7848     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7849         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%d profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
7850                sps_id, sps->profile_idc, sps->level_idc,
7851                sps->poc_type,
7852                sps->ref_frame_count,
7853                sps->mb_width, sps->mb_height,
7854                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7855                sps->direct_8x8_inference_flag ? "8B8" : "",
7856                sps->crop_left, sps->crop_right,
7857                sps->crop_top, sps->crop_bottom,
7858                sps->vui_parameters_present_flag ? "VUI" : ""
7859                );
7860     }
7861     return 0;
7862 }
7863
7864 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7865     MpegEncContext * const s = &h->s;
7866     int pps_id= get_ue_golomb(&s->gb);
7867     PPS *pps= &h->pps_buffer[pps_id];
7868
7869     pps->sps_id= get_ue_golomb(&s->gb);
7870     pps->cabac= get_bits1(&s->gb);
7871     pps->pic_order_present= get_bits1(&s->gb);
7872     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7873     if(pps->slice_group_count > 1 ){
7874         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7875         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7876         switch(pps->mb_slice_group_map_type){
7877         case 0:
7878 #if 0
7879 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7880 |    run_length[ i ]                                |1  |ue(v)   |
7881 #endif
7882             break;
7883         case 2:
7884 #if 0
7885 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7886 |{                                                  |   |        |
7887 |    top_left_mb[ i ]                               |1  |ue(v)   |
7888 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7889 |   }                                               |   |        |
7890 #endif
7891             break;
7892         case 3:
7893         case 4:
7894         case 5:
7895 #if 0
7896 |   slice_group_change_direction_flag               |1  |u(1)    |
7897 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7898 #endif
7899             break;
7900         case 6:
7901 #if 0
7902 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7903 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7904 |)                                                  |   |        |
7905 |    slice_group_id[ i ]                            |1  |u(v)    |
7906 #endif
7907             break;
7908         }
7909     }
7910     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7911     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7912     if(pps->ref_count[0] > 32 || pps->ref_count[1] > 32){
7913         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7914         return -1;
7915     }
7916
7917     pps->weighted_pred= get_bits1(&s->gb);
7918     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7919     pps->init_qp= get_se_golomb(&s->gb) + 26;
7920     pps->init_qs= get_se_golomb(&s->gb) + 26;
7921     pps->chroma_qp_index_offset= get_se_golomb(&s->gb);
7922     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7923     pps->constrained_intra_pred= get_bits1(&s->gb);
7924     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7925
7926     pps->transform_8x8_mode= 0;
7927     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7928     memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
7929     memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
7930
7931     if(get_bits_count(&s->gb) < bit_length){
7932         pps->transform_8x8_mode= get_bits1(&s->gb);
7933         decode_scaling_matrices(h, &h->sps_buffer[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7934         get_se_golomb(&s->gb);  //second_chroma_qp_index_offset
7935     }
7936
7937     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7938         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%d sps:%d %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d %s %s %s %s\n",
7939                pps_id, pps->sps_id,
7940                pps->cabac ? "CABAC" : "CAVLC",
7941                pps->slice_group_count,
7942                pps->ref_count[0], pps->ref_count[1],
7943                pps->weighted_pred ? "weighted" : "",
7944                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset,
7945                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7946                pps->constrained_intra_pred ? "CONSTR" : "",
7947                pps->redundant_pic_cnt_present ? "REDU" : "",
7948                pps->transform_8x8_mode ? "8x8DCT" : ""
7949                );
7950     }
7951
7952     return 0;
7953 }
7954
7955 /**
7956  * finds the end of the current frame in the bitstream.
7957  * @return the position of the first byte of the next frame, or -1
7958  */
7959 static int find_frame_end(H264Context *h, const uint8_t *buf, int buf_size){
7960     int i;
7961     uint32_t state;
7962     ParseContext *pc = &(h->s.parse_context);
7963 //printf("first %02X%02X%02X%02X\n", buf[0], buf[1],buf[2],buf[3]);
7964 //    mb_addr= pc->mb_addr - 1;
7965     state= pc->state;
7966     for(i=0; i<=buf_size; i++){
7967         if((state&0xFFFFFF1F) == 0x101 || (state&0xFFFFFF1F) == 0x102 || (state&0xFFFFFF1F) == 0x105){
7968             tprintf("find_frame_end new startcode = %08x, frame_start_found = %d, pos = %d\n", state, pc->frame_start_found, i);
7969             if(pc->frame_start_found){
7970                 // If there isn't one more byte in the buffer
7971                 // the test on first_mb_in_slice cannot be done yet
7972                 // do it at next call.
7973                 if (i >= buf_size) break;
7974                 if (buf[i] & 0x80) {
7975                     // first_mb_in_slice is 0, probably the first nal of a new
7976                     // slice
7977                     tprintf("find_frame_end frame_end_found, state = %08x, pos = %d\n", state, i);
7978                     pc->state=-1;
7979                     pc->frame_start_found= 0;
7980                     return i-4;
7981                 }
7982             }
7983             pc->frame_start_found = 1;
7984         }
7985         if((state&0xFFFFFF1F) == 0x107 || (state&0xFFFFFF1F) == 0x108 || (state&0xFFFFFF1F) == 0x109){
7986            if(pc->frame_start_found){
7987                 pc->state=-1;
7988                 pc->frame_start_found= 0;
7989                 return i-4;
7990            }
7991         }
7992         if (i<buf_size)
7993             state= (state<<8) | buf[i];
7994     }
7995
7996     pc->state= state;
7997     return END_NOT_FOUND;
7998 }
7999
8000 #ifdef CONFIG_H264_PARSER
8001 static int h264_parse(AVCodecParserContext *s,
8002                       AVCodecContext *avctx,
8003                       uint8_t **poutbuf, int *poutbuf_size,
8004                       const uint8_t *buf, int buf_size)
8005 {
8006     H264Context *h = s->priv_data;
8007     ParseContext *pc = &h->s.parse_context;
8008     int next;
8009
8010     next= find_frame_end(h, buf, buf_size);
8011
8012     if (ff_combine_frame(pc, next, (uint8_t **)&buf, &buf_size) < 0) {
8013         *poutbuf = NULL;
8014         *poutbuf_size = 0;
8015         return buf_size;
8016     }
8017
8018     *poutbuf = (uint8_t *)buf;
8019     *poutbuf_size = buf_size;
8020     return next;
8021 }
8022
8023 static int h264_split(AVCodecContext *avctx,
8024                       const uint8_t *buf, int buf_size)
8025 {
8026     int i;
8027     uint32_t state = -1;
8028     int has_sps= 0;
8029
8030     for(i=0; i<=buf_size; i++){
8031         if((state&0xFFFFFF1F) == 0x107)
8032             has_sps=1;
8033 /*        if((state&0xFFFFFF1F) == 0x101 || (state&0xFFFFFF1F) == 0x102 || (state&0xFFFFFF1F) == 0x105){
8034         }*/
8035         if((state&0xFFFFFF00) == 0x100 && (state&0xFFFFFF1F) != 0x107 && (state&0xFFFFFF1F) != 0x108 && (state&0xFFFFFF1F) != 0x109){
8036             if(has_sps){
8037                 while(i>4 && buf[i-5]==0) i--;
8038                 return i-4;
8039             }
8040         }
8041         if (i<buf_size)
8042             state= (state<<8) | buf[i];
8043     }
8044     return 0;
8045 }
8046 #endif /* CONFIG_H264_PARSER */
8047
8048 static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
8049     MpegEncContext * const s = &h->s;
8050     AVCodecContext * const avctx= s->avctx;
8051     int buf_index=0;
8052 #if 0
8053     int i;
8054     for(i=0; i<50; i++){
8055         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
8056     }
8057 #endif
8058     h->slice_num = 0;
8059     s->current_picture_ptr= NULL;
8060     for(;;){
8061         int consumed;
8062         int dst_length;
8063         int bit_length;
8064         uint8_t *ptr;
8065         int i, nalsize = 0;
8066
8067       if(h->is_avc) {
8068         if(buf_index >= buf_size) break;
8069         nalsize = 0;
8070         for(i = 0; i < h->nal_length_size; i++)
8071             nalsize = (nalsize << 8) | buf[buf_index++];
8072         if(nalsize <= 1){
8073             if(nalsize == 1){
8074                 buf_index++;
8075                 continue;
8076             }else{
8077                 av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
8078                 break;
8079             }
8080         }
8081       } else {
8082         // start code prefix search
8083         for(; buf_index + 3 < buf_size; buf_index++){
8084             // this should allways succeed in the first iteration
8085             if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
8086                 break;
8087         }
8088
8089         if(buf_index+3 >= buf_size) break;
8090
8091         buf_index+=3;
8092       }
8093
8094         ptr= decode_nal(h, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
8095         while(ptr[dst_length - 1] == 0 && dst_length > 1)
8096             dst_length--;
8097         bit_length= 8*dst_length - decode_rbsp_trailing(ptr + dst_length - 1);
8098
8099         if(s->avctx->debug&FF_DEBUG_STARTCODE){
8100             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", h->nal_unit_type, buf_index, buf_size, dst_length);
8101         }
8102
8103         if (h->is_avc && (nalsize != consumed))
8104             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
8105
8106         buf_index += consumed;
8107
8108         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME dont discard SEI id
8109            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
8110             continue;
8111
8112         switch(h->nal_unit_type){
8113         case NAL_IDR_SLICE:
8114             idr(h); //FIXME ensure we don't loose some frames if there is reordering
8115         case NAL_SLICE:
8116             init_get_bits(&s->gb, ptr, bit_length);
8117             h->intra_gb_ptr=
8118             h->inter_gb_ptr= &s->gb;
8119             s->data_partitioning = 0;
8120
8121             if(decode_slice_header(h) < 0){
8122                 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
8123                 break;
8124             }
8125             s->current_picture_ptr->key_frame= (h->nal_unit_type == NAL_IDR_SLICE);
8126             if(h->redundant_pic_count==0 && s->hurry_up < 5
8127                && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc)
8128                && (avctx->skip_frame < AVDISCARD_BIDIR  || h->slice_type!=B_TYPE)
8129                && (avctx->skip_frame < AVDISCARD_NONKEY || h->slice_type==I_TYPE)
8130                && avctx->skip_frame < AVDISCARD_ALL)
8131                 decode_slice(h);
8132             break;
8133         case NAL_DPA:
8134             init_get_bits(&s->gb, ptr, bit_length);
8135             h->intra_gb_ptr=
8136             h->inter_gb_ptr= NULL;
8137             s->data_partitioning = 1;
8138
8139             if(decode_slice_header(h) < 0){
8140                 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
8141             }
8142             break;
8143         case NAL_DPB:
8144             init_get_bits(&h->intra_gb, ptr, bit_length);
8145             h->intra_gb_ptr= &h->intra_gb;
8146             break;
8147         case NAL_DPC:
8148             init_get_bits(&h->inter_gb, ptr, bit_length);
8149             h->inter_gb_ptr= &h->inter_gb;
8150
8151             if(h->redundant_pic_count==0 && h->intra_gb_ptr && s->data_partitioning
8152                && s->hurry_up < 5
8153                && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc)
8154                && (avctx->skip_frame < AVDISCARD_BIDIR  || h->slice_type!=B_TYPE)
8155                && (avctx->skip_frame < AVDISCARD_NONKEY || h->slice_type==I_TYPE)
8156                && avctx->skip_frame < AVDISCARD_ALL)
8157                 decode_slice(h);
8158             break;
8159         case NAL_SEI:
8160             init_get_bits(&s->gb, ptr, bit_length);
8161             decode_sei(h);
8162             break;
8163         case NAL_SPS:
8164             init_get_bits(&s->gb, ptr, bit_length);
8165             decode_seq_parameter_set(h);
8166
8167             if(s->flags& CODEC_FLAG_LOW_DELAY)
8168                 s->low_delay=1;
8169
8170             if(avctx->has_b_frames < 2)
8171                 avctx->has_b_frames= !s->low_delay;
8172             break;
8173         case NAL_PPS:
8174             init_get_bits(&s->gb, ptr, bit_length);
8175
8176             decode_picture_parameter_set(h, bit_length);
8177
8178             break;
8179         case NAL_AUD:
8180         case NAL_END_SEQUENCE:
8181         case NAL_END_STREAM:
8182         case NAL_FILLER_DATA:
8183         case NAL_SPS_EXT:
8184         case NAL_AUXILIARY_SLICE:
8185             break;
8186         default:
8187             av_log(avctx, AV_LOG_ERROR, "Unknown NAL code: %d\n", h->nal_unit_type);
8188         }
8189     }
8190
8191     if(!s->current_picture_ptr) return buf_index; //no frame
8192
8193     s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
8194     s->current_picture_ptr->pict_type= s->pict_type;
8195
8196     h->prev_frame_num_offset= h->frame_num_offset;
8197     h->prev_frame_num= h->frame_num;
8198     if(s->current_picture_ptr->reference){
8199         h->prev_poc_msb= h->poc_msb;
8200         h->prev_poc_lsb= h->poc_lsb;
8201     }
8202     if(s->current_picture_ptr->reference)
8203         execute_ref_pic_marking(h, h->mmco, h->mmco_index);
8204
8205     ff_er_frame_end(s);
8206
8207     MPV_frame_end(s);
8208
8209     return buf_index;
8210 }
8211
8212 /**
8213  * returns the number of bytes consumed for building the current frame
8214  */
8215 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
8216     if(s->flags&CODEC_FLAG_TRUNCATED){
8217         pos -= s->parse_context.last_index;
8218         if(pos<0) pos=0; // FIXME remove (unneeded?)
8219
8220         return pos;
8221     }else{
8222         if(pos==0) pos=1; //avoid infinite loops (i doubt thats needed but ...)
8223         if(pos+10>buf_size) pos=buf_size; // oops ;)
8224
8225         return pos;
8226     }
8227 }
8228
8229 static int decode_frame(AVCodecContext *avctx,
8230                              void *data, int *data_size,
8231                              uint8_t *buf, int buf_size)
8232 {
8233     H264Context *h = avctx->priv_data;
8234     MpegEncContext *s = &h->s;
8235     AVFrame *pict = data;
8236     int buf_index;
8237
8238     s->flags= avctx->flags;
8239     s->flags2= avctx->flags2;
8240
8241    /* no supplementary picture */
8242     if (buf_size == 0) {
8243         return 0;
8244     }
8245
8246     if(s->flags&CODEC_FLAG_TRUNCATED){
8247         int next= find_frame_end(h, buf, buf_size);
8248
8249         if( ff_combine_frame(&s->parse_context, next, &buf, &buf_size) < 0 )
8250             return buf_size;
8251 //printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
8252     }
8253
8254     if(h->is_avc && !h->got_avcC) {
8255         int i, cnt, nalsize;
8256         unsigned char *p = avctx->extradata;
8257         if(avctx->extradata_size < 7) {
8258             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
8259             return -1;
8260         }
8261         if(*p != 1) {
8262             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
8263             return -1;
8264         }
8265         /* sps and pps in the avcC always have length coded with 2 bytes,
8266            so put a fake nal_length_size = 2 while parsing them */
8267         h->nal_length_size = 2;
8268         // Decode sps from avcC
8269         cnt = *(p+5) & 0x1f; // Number of sps
8270         p += 6;
8271         for (i = 0; i < cnt; i++) {
8272             nalsize = BE_16(p) + 2;
8273             if(decode_nal_units(h, p, nalsize) < 0) {
8274                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
8275                 return -1;
8276             }
8277             p += nalsize;
8278         }
8279         // Decode pps from avcC
8280         cnt = *(p++); // Number of pps
8281         for (i = 0; i < cnt; i++) {
8282             nalsize = BE_16(p) + 2;
8283             if(decode_nal_units(h, p, nalsize)  != nalsize) {
8284                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
8285                 return -1;
8286             }
8287             p += nalsize;
8288         }
8289         // Now store right nal length size, that will be use to parse all other nals
8290         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
8291         // Do not reparse avcC
8292         h->got_avcC = 1;
8293     }
8294
8295     if(!h->is_avc && s->avctx->extradata_size && s->picture_number==0){
8296         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
8297             return -1;
8298     }
8299
8300     buf_index=decode_nal_units(h, buf, buf_size);
8301     if(buf_index < 0)
8302         return -1;
8303
8304     //FIXME do something with unavailable reference frames
8305
8306 //    if(ret==FRAME_SKIPPED) return get_consumed_bytes(s, buf_index, buf_size);
8307     if(!s->current_picture_ptr){
8308         av_log(h->s.avctx, AV_LOG_DEBUG, "error, NO frame\n");
8309         return -1;
8310     }
8311
8312     {
8313         Picture *out = s->current_picture_ptr;
8314 #if 0 //decode order
8315         *data_size = sizeof(AVFrame);
8316 #else
8317         /* Sort B-frames into display order */
8318         Picture *cur = s->current_picture_ptr;
8319         Picture *prev = h->delayed_output_pic;
8320         int i, pics, cross_idr, out_of_order, out_idx;
8321
8322         if(h->sps.bitstream_restriction_flag
8323            && s->avctx->has_b_frames < h->sps.num_reorder_frames){
8324             s->avctx->has_b_frames = h->sps.num_reorder_frames;
8325             s->low_delay = 0;
8326         }
8327
8328         pics = 0;
8329         while(h->delayed_pic[pics]) pics++;
8330         h->delayed_pic[pics++] = cur;
8331         if(cur->reference == 0)
8332             cur->reference = 1;
8333
8334         cross_idr = 0;
8335         for(i=0; h->delayed_pic[i]; i++)
8336             if(h->delayed_pic[i]->key_frame || h->delayed_pic[i]->poc==0)
8337                 cross_idr = 1;
8338
8339         out = h->delayed_pic[0];
8340         out_idx = 0;
8341         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
8342             if(h->delayed_pic[i]->poc < out->poc){
8343                 out = h->delayed_pic[i];
8344                 out_idx = i;
8345             }
8346
8347         out_of_order = !cross_idr && prev && out->poc < prev->poc;
8348         if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
8349             { }
8350         else if(prev && pics <= s->avctx->has_b_frames)
8351             out = prev;
8352         else if((out_of_order && pics-1 == s->avctx->has_b_frames && pics < 15)
8353            || (s->low_delay &&
8354             ((!cross_idr && prev && out->poc > prev->poc + 2)
8355              || cur->pict_type == B_TYPE)))
8356         {
8357             s->low_delay = 0;
8358             s->avctx->has_b_frames++;
8359             out = prev;
8360         }
8361         else if(out_of_order)
8362             out = prev;
8363
8364         if(out_of_order || pics > s->avctx->has_b_frames){
8365             for(i=out_idx; h->delayed_pic[i]; i++)
8366                 h->delayed_pic[i] = h->delayed_pic[i+1];
8367         }
8368
8369         if(prev == out)
8370             *data_size = 0;
8371         else
8372             *data_size = sizeof(AVFrame);
8373         if(prev && prev != out && prev->reference == 1)
8374             prev->reference = 0;
8375         h->delayed_output_pic = out;
8376 #endif
8377
8378         if(out)
8379             *pict= *(AVFrame*)out;
8380         else
8381             av_log(avctx, AV_LOG_DEBUG, "no picture\n");
8382     }
8383
8384     assert(pict->data[0] || !*data_size);
8385     ff_print_debug_info(s, pict);
8386 //printf("out %d\n", (int)pict->data[0]);
8387 #if 0 //?
8388
8389     /* Return the Picture timestamp as the frame number */
8390     /* we substract 1 because it is added on utils.c    */
8391     avctx->frame_number = s->picture_number - 1;
8392 #endif
8393     return get_consumed_bytes(s, buf_index, buf_size);
8394 }
8395 #if 0
8396 static inline void fill_mb_avail(H264Context *h){
8397     MpegEncContext * const s = &h->s;
8398     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
8399
8400     if(s->mb_y){
8401         h->mb_avail[0]= s->mb_x                 && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
8402         h->mb_avail[1]=                            h->slice_table[mb_xy - s->mb_stride    ] == h->slice_num;
8403         h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
8404     }else{
8405         h->mb_avail[0]=
8406         h->mb_avail[1]=
8407         h->mb_avail[2]= 0;
8408     }
8409     h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
8410     h->mb_avail[4]= 1; //FIXME move out
8411     h->mb_avail[5]= 0; //FIXME move out
8412 }
8413 #endif
8414
8415 #if 0 //selftest
8416 #define COUNT 8000
8417 #define SIZE (COUNT*40)
8418 int main(){
8419     int i;
8420     uint8_t temp[SIZE];
8421     PutBitContext pb;
8422     GetBitContext gb;
8423 //    int int_temp[10000];
8424     DSPContext dsp;
8425     AVCodecContext avctx;
8426
8427     dsputil_init(&dsp, &avctx);
8428
8429     init_put_bits(&pb, temp, SIZE);
8430     printf("testing unsigned exp golomb\n");
8431     for(i=0; i<COUNT; i++){
8432         START_TIMER
8433         set_ue_golomb(&pb, i);
8434         STOP_TIMER("set_ue_golomb");
8435     }
8436     flush_put_bits(&pb);
8437
8438     init_get_bits(&gb, temp, 8*SIZE);
8439     for(i=0; i<COUNT; i++){
8440         int j, s;
8441
8442         s= show_bits(&gb, 24);
8443
8444         START_TIMER
8445         j= get_ue_golomb(&gb);
8446         if(j != i){
8447             printf("missmatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
8448 //            return -1;
8449         }
8450         STOP_TIMER("get_ue_golomb");
8451     }
8452
8453
8454     init_put_bits(&pb, temp, SIZE);
8455     printf("testing signed exp golomb\n");
8456     for(i=0; i<COUNT; i++){
8457         START_TIMER
8458         set_se_golomb(&pb, i - COUNT/2);
8459         STOP_TIMER("set_se_golomb");
8460     }
8461     flush_put_bits(&pb);
8462
8463     init_get_bits(&gb, temp, 8*SIZE);
8464     for(i=0; i<COUNT; i++){
8465         int j, s;
8466
8467         s= show_bits(&gb, 24);
8468
8469         START_TIMER
8470         j= get_se_golomb(&gb);
8471         if(j != i - COUNT/2){
8472             printf("missmatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
8473 //            return -1;
8474         }
8475         STOP_TIMER("get_se_golomb");
8476     }
8477
8478     printf("testing 4x4 (I)DCT\n");
8479
8480     DCTELEM block[16];
8481     uint8_t src[16], ref[16];
8482     uint64_t error= 0, max_error=0;
8483
8484     for(i=0; i<COUNT; i++){
8485         int j;
8486 //        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
8487         for(j=0; j<16; j++){
8488             ref[j]= random()%255;
8489             src[j]= random()%255;
8490         }
8491
8492         h264_diff_dct_c(block, src, ref, 4);
8493
8494         //normalize
8495         for(j=0; j<16; j++){
8496 //            printf("%d ", block[j]);
8497             block[j]= block[j]*4;
8498             if(j&1) block[j]= (block[j]*4 + 2)/5;
8499             if(j&4) block[j]= (block[j]*4 + 2)/5;
8500         }
8501 //        printf("\n");
8502
8503         s->dsp.h264_idct_add(ref, block, 4);
8504 /*        for(j=0; j<16; j++){
8505             printf("%d ", ref[j]);
8506         }
8507         printf("\n");*/
8508
8509         for(j=0; j<16; j++){
8510             int diff= FFABS(src[j] - ref[j]);
8511
8512             error+= diff*diff;
8513             max_error= FFMAX(max_error, diff);
8514         }
8515     }
8516     printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
8517 #if 0
8518     printf("testing quantizer\n");
8519     for(qp=0; qp<52; qp++){
8520         for(i=0; i<16; i++)
8521             src1_block[i]= src2_block[i]= random()%255;
8522
8523     }
8524 #endif
8525     printf("Testing NAL layer\n");
8526
8527     uint8_t bitstream[COUNT];
8528     uint8_t nal[COUNT*2];
8529     H264Context h;
8530     memset(&h, 0, sizeof(H264Context));
8531
8532     for(i=0; i<COUNT; i++){
8533         int zeros= i;
8534         int nal_length;
8535         int consumed;
8536         int out_length;
8537         uint8_t *out;
8538         int j;
8539
8540         for(j=0; j<COUNT; j++){
8541             bitstream[j]= (random() % 255) + 1;
8542         }
8543
8544         for(j=0; j<zeros; j++){
8545             int pos= random() % COUNT;
8546             while(bitstream[pos] == 0){
8547                 pos++;
8548                 pos %= COUNT;
8549             }
8550             bitstream[pos]=0;
8551         }
8552
8553         START_TIMER
8554
8555         nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
8556         if(nal_length<0){
8557             printf("encoding failed\n");
8558             return -1;
8559         }
8560
8561         out= decode_nal(&h, nal, &out_length, &consumed, nal_length);
8562
8563         STOP_TIMER("NAL")
8564
8565         if(out_length != COUNT){
8566             printf("incorrect length %d %d\n", out_length, COUNT);
8567             return -1;
8568         }
8569
8570         if(consumed != nal_length){
8571             printf("incorrect consumed length %d %d\n", nal_length, consumed);
8572             return -1;
8573         }
8574
8575         if(memcmp(bitstream, out, COUNT)){
8576             printf("missmatch\n");
8577             return -1;
8578         }
8579     }
8580
8581     printf("Testing RBSP\n");
8582
8583
8584     return 0;
8585 }
8586 #endif
8587
8588
8589 static int decode_end(AVCodecContext *avctx)
8590 {
8591     H264Context *h = avctx->priv_data;
8592     MpegEncContext *s = &h->s;
8593
8594     av_freep(&h->rbsp_buffer);
8595     free_tables(h); //FIXME cleanup init stuff perhaps
8596     MPV_common_end(s);
8597
8598 //    memset(h, 0, sizeof(H264Context));
8599
8600     return 0;
8601 }
8602
8603
8604 AVCodec h264_decoder = {
8605     "h264",
8606     CODEC_TYPE_VIDEO,
8607     CODEC_ID_H264,
8608     sizeof(H264Context),
8609     decode_init,
8610     NULL,
8611     decode_end,
8612     decode_frame,
8613     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,
8614     .flush= flush_dpb,
8615 };
8616
8617 #ifdef CONFIG_H264_PARSER
8618 AVCodecParser h264_parser = {
8619     { CODEC_ID_H264 },
8620     sizeof(H264Context),
8621     NULL,
8622     h264_parse,
8623     ff_parse_close,
8624     h264_split,
8625 };
8626 #endif
8627
8628 #include "svq3.c"