2 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
3 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 * H.264 / AVC / MPEG4 part10 codec.
26 * @author Michael Niedermayer <michaelni@gmx.at>
32 #include "mpegvideo.h"
41 #define interlaced_dct interlaced_dct_is_a_bad_name
42 #define mb_intra mb_intra_isnt_initalized_see_mb_type
44 #define LUMA_DC_BLOCK_INDEX 25
45 #define CHROMA_DC_BLOCK_INDEX 26
47 #define CHROMA_DC_COEFF_TOKEN_VLC_BITS 8
48 #define COEFF_TOKEN_VLC_BITS 8
49 #define TOTAL_ZEROS_VLC_BITS 9
50 #define CHROMA_DC_TOTAL_ZEROS_VLC_BITS 3
51 #define RUN_VLC_BITS 3
52 #define RUN7_VLC_BITS 6
54 #define MAX_SPS_COUNT 32
55 #define MAX_PPS_COUNT 256
57 #define MAX_MMCO_COUNT 66
59 /* Compiling in interlaced support reduces the speed
60 * of progressive decoding by about 2%. */
61 #define ALLOW_INTERLACE
63 #ifdef ALLOW_INTERLACE
64 #define MB_MBAFF h->mb_mbaff
65 #define MB_FIELD h->mb_field_decoding_flag
66 #define FRAME_MBAFF h->mb_aff_frame
72 #define IS_INTERLACED(mb_type) 0
76 * Sequence parameter set
82 int transform_bypass; ///< qpprime_y_zero_transform_bypass_flag
83 int log2_max_frame_num; ///< log2_max_frame_num_minus4 + 4
84 int poc_type; ///< pic_order_cnt_type
85 int log2_max_poc_lsb; ///< log2_max_pic_order_cnt_lsb_minus4
86 int delta_pic_order_always_zero_flag; ///< eponymous bitstream flag
87 int offset_for_non_ref_pic; ///< eponymous bitstream element
88 int offset_for_top_to_bottom_field; ///< eponymous bitstream element
89 int poc_cycle_length; ///< num_ref_frames_in_pic_order_cnt_cycle
90 int ref_frame_count; ///< num_ref_frames
91 int gaps_in_frame_num_allowed_flag; ///< eponymous bitstream flag
92 int mb_width; ///< frame_width_in_mbs_minus1 + 1
93 int mb_height; ///< frame_height_in_mbs_minus1 + 1
94 int frame_mbs_only_flag; ///< eponymous bitstream flag
95 int mb_aff; ///<mb_adaptive_frame_field_flag
96 int direct_8x8_inference_flag; ///< eponymous bitstream flag
97 int crop; ///< frame_cropping_flag
98 int crop_left; ///< frame_cropping_rect_left_offset
99 int crop_right; ///< frame_cropping_rect_right_offset
100 int crop_top; ///< frame_cropping_rect_top_offset
101 int crop_bottom; ///< frame_cropping_rect_bottom_offset
102 int vui_parameters_present_flag; ///< eponymous bitstream flag
104 int timing_info_present_flag; ///< eponymous VUI flag
105 uint32_t num_units_in_tick; ///< eponymous VUI timing element
107 int fixed_frame_rate_flag; ///< eponymous VUI flag
108 short offset_for_ref_frame[256]; //FIXME dyn aloc?
109 int bitstream_restriction_flag; ///< eponymous VUI flag
110 int num_reorder_frames; ///< eponymous VUI element
111 int scaling_matrix_present; ///< nonzero if scaling lists were coded — inferred from name, verify
112 uint8_t scaling_matrix4[6][16]; ///< six 16-coefficient 4x4 scaling lists
113 uint8_t scaling_matrix8[2][64]; ///< two 64-coefficient 8x8 scaling lists
117 * Picture parameter set
121 int cabac; ///< entropy_coding_mode_flag
122 int pic_order_present; ///< pic_order_present_flag
123 int slice_group_count; ///< num_slice_groups_minus1 + 1
124 int mb_slice_group_map_type; ///< slice_group_map_type
125 int ref_count[2]; ///< num_ref_idx_l0/1_active_minus1 + 1
126 int weighted_pred; ///< weighted_pred_flag
127 int weighted_bipred_idc; ///< eponymous bitstream element
128 int init_qp; ///< pic_init_qp_minus26 + 26
129 int init_qs; ///< pic_init_qs_minus26 + 26
130 int chroma_qp_index_offset; ///< eponymous bitstream element
131 int deblocking_filter_parameters_present; ///< deblocking_filter_parameters_present_flag
132 int constrained_intra_pred; ///< constrained_intra_pred_flag
133 int redundant_pic_cnt_present; ///< redundant_pic_cnt_present_flag
134 int transform_8x8_mode; ///< transform_8x8_mode_flag
135 uint8_t scaling_matrix4[6][16]; ///< six 16-coefficient 4x4 scaling lists
136 uint8_t scaling_matrix8[2][64]; ///< two 64-coefficient 8x8 scaling lists
140 * Memory management control operation opcode.
142 typedef enum MMCOOpcode{
153 * Memory management control operation.
164 typedef struct H264Context{
// NOTE(review): the embedded line numbers jump repeatedly, so many members
// (and the closing brace) of this struct are elided from this listing.
// NAL unit type codes (subset; earlier values elided here).
172 #define NAL_IDR_SLICE 5
177 #define NAL_END_SEQUENCE 10
178 #define NAL_END_STREAM 11
179 #define NAL_FILLER_DATA 12
180 #define NAL_SPS_EXT 13
181 #define NAL_AUXILIARY_SLICE 19
182 uint8_t *rbsp_buffer; ///< RBSP scratch buffer
183 unsigned int rbsp_buffer_size; ///< allocated size of rbsp_buffer
186 * Used to parse AVC variant of h264
188 int is_avc; ///< this flag is != 0 if codec is avc1
189 int got_avcC; ///< flag used to parse avcC data only once
190 int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4)
198 int chroma_pred_mode;
199 int intra16x16_pred_mode;
204 int8_t intra4x4_pred_mode_cache[5*8]; ///< 5x8 cache, indexed via scan8[]
205 int8_t (*intra4x4_pred_mode)[8];
// intra prediction function pointer tables (per prediction mode)
206 void (*pred4x4 [9+3])(uint8_t *src, uint8_t *topright, int stride);//FIXME move to dsp?
207 void (*pred8x8l [9+3])(uint8_t *src, int topleft, int topright, int stride);
208 void (*pred8x8 [4+3])(uint8_t *src, int stride);
209 void (*pred16x16[4+3])(uint8_t *src, int stride);
210 unsigned int topleft_samples_available;
211 unsigned int top_samples_available;
212 unsigned int topright_samples_available;
213 unsigned int left_samples_available;
214 uint8_t (*top_borders[2])[16+2*8];
215 uint8_t left_border[2*(17+2*9)];
218 * non zero coeff count cache.
219 * is 64 if not available.
221 DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]);
222 uint8_t (*non_zero_count)[16];
225 * Motion vector cache.
227 DECLARE_ALIGNED_8(int16_t, mv_cache[2][5*8][2]);
228 DECLARE_ALIGNED_8(int8_t, ref_cache[2][5*8]);
229 #define LIST_NOT_USED -1 //FIXME rename?
230 #define PART_NOT_AVAILABLE -2
233 * is 1 if the specific list MV&references are set to 0,0,-2.
235 int mv_cache_clean[2];
238 * number of neighbors (top and/or left) that used 8x8 dct
240 int neighbor_transform_size;
243 * block_offset[ 0..23] for frame macroblocks
244 * block_offset[24..47] for field macroblocks
246 int block_offset[2*(16+8)];
248 uint32_t *mb2b_xy; //FIXME are these 4 a good idea?
250 int b_stride; //FIXME use s->b4_stride
253 int mb_linesize; ///< may be equal to s->linesize or s->linesize*2, for mbaff
262 int unknown_svq3_flag;
263 int next_slice_index;
265 SPS sps_buffer[MAX_SPS_COUNT];
266 SPS sps; ///< current sps
268 PPS pps_buffer[MAX_PPS_COUNT];
272 PPS pps; //FIXME move to Picture perhaps? (->no) do we need that?
// dequantization tables, indexed by [list][qp][coeff]
274 uint32_t dequant4_buffer[6][52][16];
275 uint32_t dequant8_buffer[2][52][64];
276 uint32_t (*dequant4_coeff[6])[16];
277 uint32_t (*dequant8_coeff[2])[64];
278 int dequant_coeff_pps; ///< reinit tables when pps changes
281 uint8_t *slice_table_base;
282 uint8_t *slice_table; ///< slice_table_base + 2*mb_stride + 1
284 int slice_type_fixed;
286 //interlacing specific flags
288 int mb_field_decoding_flag;
289 int mb_mbaff; ///< mb_aff_frame && mb_field_decoding_flag
296 int delta_poc_bottom;
299 int prev_poc_msb; ///< poc_msb of the last reference pic for POC type 0
300 int prev_poc_lsb; ///< poc_lsb of the last reference pic for POC type 0
301 int frame_num_offset; ///< for POC type 2
302 int prev_frame_num_offset; ///< for POC type 2
303 int prev_frame_num; ///< frame_num of the last pic for POC type 1/2
306 * frame_num for frames or 2*frame_num for field pics.
311 * max_frame_num or 2*max_frame_num for field pics.
315 //Weighted pred stuff
317 int use_weight_chroma;
318 int luma_log2_weight_denom;
319 int chroma_log2_weight_denom;
320 int luma_weight[2][48];
321 int luma_offset[2][48];
322 int chroma_weight[2][48][2];
323 int chroma_offset[2][48][2];
324 int implicit_weight[48][48];
327 int deblocking_filter; ///< disable_deblocking_filter_idc with 1<->0
328 int slice_alpha_c0_offset;
329 int slice_beta_offset;
331 int redundant_pic_count;
333 int direct_spatial_mv_pred;
334 int dist_scale_factor[16];
335 int dist_scale_factor_field[32];
336 int map_col_to_list0[2][16];
337 int map_col_to_list0_field[2][32];
340 * num_ref_idx_l0/1_active_minus1 + 1
342 int ref_count[2]; ///< counts frames or fields, depending on current mb mode
343 Picture *short_ref[32];
344 Picture *long_ref[32];
345 Picture default_ref_list[2][32];
346 Picture ref_list[2][48]; ///< 0..15: frame refs, 16..47: mbaff field refs
347 Picture *delayed_pic[16]; //FIXME size?
348 Picture *delayed_output_pic;
351 * memory management control operations buffer.
353 MMCO mmco[MAX_MMCO_COUNT];
356 int long_ref_count; ///< number of actual long term references
357 int short_ref_count; ///< number of actual short term references
360 GetBitContext intra_gb;
361 GetBitContext inter_gb;
362 GetBitContext *intra_gb_ptr;
363 GetBitContext *inter_gb_ptr;
365 DECLARE_ALIGNED_8(DCTELEM, mb[16*24]); ///< residual coefficients: 24 blocks of 16 DCTELEMs
371 uint8_t cabac_state[460]; ///< CABAC context states (460 contexts)
374 /* 0x100 -> non null luma_dc, 0x80/0x40 -> non null chroma_dc (cb/cr), 0x?0 -> chroma_cbp(0,1,2), 0x0? luma_cbp */
379 /* chroma_pred_mode for i4x4 or i16x16, else 0 */
380 uint8_t *chroma_pred_mode_table;
381 int last_qscale_diff;
382 int16_t (*mvd_table[2])[2];
383 DECLARE_ALIGNED_8(int16_t, mvd_cache[2][5*8][2]);
384 uint8_t *direct_table;
385 uint8_t direct_cache[5*8];
// coefficient scan orders; the *_q0 variants below appear to keep
// unpermuted copies — TODO confirm against the elided init code
387 uint8_t zigzag_scan[16];
388 uint8_t zigzag_scan8x8[64];
389 uint8_t zigzag_scan8x8_cavlc[64];
390 uint8_t field_scan[16];
391 uint8_t field_scan8x8[64];
392 uint8_t field_scan8x8_cavlc[64];
393 const uint8_t *zigzag_scan_q0;
394 const uint8_t *zigzag_scan8x8_q0;
395 const uint8_t *zigzag_scan8x8_cavlc_q0;
396 const uint8_t *field_scan_q0;
397 const uint8_t *field_scan8x8_q0;
398 const uint8_t *field_scan8x8_cavlc_q0;
403 static VLC coeff_token_vlc[4];
404 static VLC chroma_dc_coeff_token_vlc;
406 static VLC total_zeros_vlc[15];
407 static VLC chroma_dc_total_zeros_vlc[3];
409 static VLC run_vlc[6];
412 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
413 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
414 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
415 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
// Packs two 16-bit halves into one 32-bit word such that `a` occupies the
// first int16 in memory on either endianness.
// NOTE(review): the #else/#endif and closing brace are elided in this listing.
417 static always_inline uint32_t pack16to32(int a, int b){
418 #ifdef WORDS_BIGENDIAN
419 return (b&0xFFFF) + (a<<16);
421 return (a&0xFFFF) + (b<<16);
427 * @param h height of the rectangle, should be a constant
428 * @param w width of the rectangle, should be a constant
429 * @param size the size of val (1 or 4), should be a constant
431 static always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
// NOTE(review): the embedded line numbers jump (e.g. 433->439, 440->442),
// so the branch selectors choosing between the width/size cases below are
// elided from this listing; only the per-case store sequences are visible.
432 uint8_t *p= (uint8_t*)vp;
433 assert(size==1 || size==4);
439 assert((((long)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
440 assert((stride&(w-1))==0);
// case: 2 bytes per row — replicate val into a 16-bit pattern
442 const uint16_t v= size==4 ? val : val*0x0101;
443 *(uint16_t*)(p + 0*stride)= v;
445 *(uint16_t*)(p + 1*stride)= v;
447 *(uint16_t*)(p + 2*stride)=
448 *(uint16_t*)(p + 3*stride)= v;
// case: 4 bytes per row — replicate val into a 32-bit pattern
450 const uint32_t v= size==4 ? val : val*0x01010101;
451 *(uint32_t*)(p + 0*stride)= v;
453 *(uint32_t*)(p + 1*stride)= v;
455 *(uint32_t*)(p + 2*stride)=
456 *(uint32_t*)(p + 3*stride)= v;
458 //gcc can't optimize 64bit math on x86_32
459 #if defined(ARCH_X86_64) || (defined(MP_WORDSIZE) && MP_WORDSIZE >= 64)
// case: 8 bytes per row via one 64-bit store (val doubled into both halves)
460 const uint64_t v= val*0x0100000001ULL;
461 *(uint64_t*)(p + 0*stride)= v;
463 *(uint64_t*)(p + 1*stride)= v;
465 *(uint64_t*)(p + 2*stride)=
466 *(uint64_t*)(p + 3*stride)= v;
// case: 16 bytes per row via two 64-bit stores
468 const uint64_t v= val*0x0100000001ULL;
469 *(uint64_t*)(p + 0+0*stride)=
470 *(uint64_t*)(p + 8+0*stride)=
471 *(uint64_t*)(p + 0+1*stride)=
472 *(uint64_t*)(p + 8+1*stride)= v;
474 *(uint64_t*)(p + 0+2*stride)=
475 *(uint64_t*)(p + 8+2*stride)=
476 *(uint64_t*)(p + 0+3*stride)=
477 *(uint64_t*)(p + 8+3*stride)= v;
// 32-bit fallback: same rectangles with 32-bit stores
479 *(uint32_t*)(p + 0+0*stride)=
480 *(uint32_t*)(p + 4+0*stride)= val;
482 *(uint32_t*)(p + 0+1*stride)=
483 *(uint32_t*)(p + 4+1*stride)= val;
485 *(uint32_t*)(p + 0+2*stride)=
486 *(uint32_t*)(p + 4+2*stride)=
487 *(uint32_t*)(p + 0+3*stride)=
488 *(uint32_t*)(p + 4+3*stride)= val;
490 *(uint32_t*)(p + 0+0*stride)=
491 *(uint32_t*)(p + 4+0*stride)=
492 *(uint32_t*)(p + 8+0*stride)=
493 *(uint32_t*)(p +12+0*stride)=
494 *(uint32_t*)(p + 0+1*stride)=
495 *(uint32_t*)(p + 4+1*stride)=
496 *(uint32_t*)(p + 8+1*stride)=
497 *(uint32_t*)(p +12+1*stride)= val;
499 *(uint32_t*)(p + 0+2*stride)=
500 *(uint32_t*)(p + 4+2*stride)=
501 *(uint32_t*)(p + 8+2*stride)=
502 *(uint32_t*)(p +12+2*stride)=
503 *(uint32_t*)(p + 0+3*stride)=
504 *(uint32_t*)(p + 4+3*stride)=
505 *(uint32_t*)(p + 8+3*stride)=
506 *(uint32_t*)(p +12+3*stride)= val;
// Fills the per-macroblock neighbour caches (intra modes, nnz, mv/ref/mvd,
// direct flags) from the neighbouring macroblocks' stored state.
// NOTE(review): the embedded line numbers jump throughout (e.g. 517->521,
// 625->631), so many statements, branch bodies and closing braces of this
// function are elided from this listing; comments describe only visible code.
513 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
514 MpegEncContext * const s = &h->s;
515 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
516 int topleft_xy, top_xy, topright_xy, left_xy[2];
517 int topleft_type, top_type, topright_type, left_type[2];
521 //FIXME deblocking could skip the intra and nnz parts.
522 if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[mb_xy-s->mb_stride]) && !FRAME_MBAFF)
525 //wow what a mess, why didn't they simplify the interlacing&intra stuff, i can't imagine that these complex rules are worth it
// default (progressive) neighbour addressing
527 top_xy = mb_xy - s->mb_stride;
528 topleft_xy = top_xy - 1;
529 topright_xy= top_xy + 1;
530 left_xy[1] = left_xy[0] = mb_xy-1;
// MBAFF: neighbours are addressed in terms of macroblock pairs
540 const int pair_xy = s->mb_x + (s->mb_y & ~1)*s->mb_stride;
541 const int top_pair_xy = pair_xy - s->mb_stride;
542 const int topleft_pair_xy = top_pair_xy - 1;
543 const int topright_pair_xy = top_pair_xy + 1;
544 const int topleft_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
545 const int top_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
546 const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
547 const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
548 const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
549 const int bottom = (s->mb_y & 1);
550 tprintf("fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
// the following ?: fragments adjust top/topleft/topright to the correct
// field of the neighbouring pair (enclosing conditions elided here)
552 ? !curr_mb_frame_flag // bottom macroblock
553 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
555 top_xy -= s->mb_stride;
558 ? !curr_mb_frame_flag // bottom macroblock
559 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
561 topleft_xy -= s->mb_stride;
564 ? !curr_mb_frame_flag // bottom macroblock
565 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
567 topright_xy -= s->mb_stride;
569 if (left_mb_frame_flag != curr_mb_frame_flag) {
570 left_xy[1] = left_xy[0] = pair_xy - 1;
571 if (curr_mb_frame_flag) {
592 left_xy[1] += s->mb_stride;
605 h->top_mb_xy = top_xy;
606 h->left_mb_xy[0] = left_xy[0];
607 h->left_mb_xy[1] = left_xy[1];
// deblock path: slice_table < 255 means the mb was decoded at all
611 top_type = h->slice_table[top_xy ] < 255 ? s->current_picture.mb_type[top_xy] : 0;
612 left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
613 left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
615 if(FRAME_MBAFF && !IS_INTRA(mb_type)){
// unpack per-4x4 luma nnz flags stored as a 16-bit mask at [14]
617 int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
619 h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
620 for(list=0; list<1+(h->slice_type==B_TYPE); list++){
621 if(USES_LIST(mb_type,list)){
622 uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
623 uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
624 int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
625 for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
631 *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
632 *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
634 *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
635 *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
637 fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
638 fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
// decode path: neighbour types are only valid from the same slice
643 topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
644 top_type = h->slice_table[top_xy ] == h->slice_num ? s->current_picture.mb_type[top_xy] : 0;
645 topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
646 left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
647 left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
// intra: compute per-sample availability bitmasks for the predictors
650 if(IS_INTRA(mb_type)){
651 h->topleft_samples_available=
652 h->top_samples_available=
653 h->left_samples_available= 0xFFFF;
654 h->topright_samples_available= 0xEEEA;
656 if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
657 h->topleft_samples_available= 0xB3FF;
658 h->top_samples_available= 0x33FF;
659 h->topright_samples_available= 0x26EA;
662 if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
663 h->topleft_samples_available&= 0xDF5F;
664 h->left_samples_available&= 0x5F5F;
668 if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
669 h->topleft_samples_available&= 0x7FFF;
671 if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
672 h->topright_samples_available&= 0xFBFF;
// import neighbouring intra4x4 prediction modes into the cache
674 if(IS_INTRA4x4(mb_type)){
675 if(IS_INTRA4x4(top_type)){
676 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
677 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
678 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
679 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
682 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
687 h->intra4x4_pred_mode_cache[4+8*0]=
688 h->intra4x4_pred_mode_cache[5+8*0]=
689 h->intra4x4_pred_mode_cache[6+8*0]=
690 h->intra4x4_pred_mode_cache[7+8*0]= pred;
693 if(IS_INTRA4x4(left_type[i])){
694 h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
695 h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
698 if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
703 h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
704 h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
// import neighbouring non-zero-coefficient counts
719 //FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
721 h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
722 h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
723 h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
724 h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
726 h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
727 h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
729 h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
730 h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
// no top neighbour: CAVLC needs "unavailable" (64), CABAC inter needs 0
733 h->non_zero_count_cache[4+8*0]=
734 h->non_zero_count_cache[5+8*0]=
735 h->non_zero_count_cache[6+8*0]=
736 h->non_zero_count_cache[7+8*0]=
738 h->non_zero_count_cache[1+8*0]=
739 h->non_zero_count_cache[2+8*0]=
741 h->non_zero_count_cache[1+8*3]=
742 h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
746 for (i=0; i<2; i++) {
748 h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
749 h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
750 h->non_zero_count_cache[0+8*1 + 8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
751 h->non_zero_count_cache[0+8*4 + 8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
753 h->non_zero_count_cache[3+8*1 + 2*8*i]=
754 h->non_zero_count_cache[3+8*2 + 2*8*i]=
755 h->non_zero_count_cache[0+8*1 + 8*i]=
756 h->non_zero_count_cache[0+8*4 + 8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
// CBP of the top/left neighbours, for CABAC context derivation
763 h->top_cbp = h->cbp_table[top_xy];
764 } else if(IS_INTRA(mb_type)) {
771 h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
772 } else if(IS_INTRA(mb_type)) {
778 h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
781 h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
// inter/direct: fill the mv/ref caches from the four neighbours
786 if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
788 for(list=0; list<1+(h->slice_type==B_TYPE); list++){
789 if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
790 /*if(!h->mv_cache_clean[list]){
791 memset(h->mv_cache [list], 0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
792 memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
793 h->mv_cache_clean[list]= 1;
797 h->mv_cache_clean[list]= 0;
799 if(USES_LIST(top_type, list)){
800 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
801 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
802 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
803 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
804 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
805 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
806 h->ref_cache[list][scan8[0] + 0 - 1*8]=
807 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
808 h->ref_cache[list][scan8[0] + 2 - 1*8]=
809 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
811 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
812 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
813 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
814 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
815 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
818 //FIXME unify cleanup or sth
819 if(USES_LIST(left_type[0], list)){
820 const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
821 const int b8_xy= h->mb2b8_xy[left_xy[0]] + 1;
822 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0]];
823 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1]];
824 h->ref_cache[list][scan8[0] - 1 + 0*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0]>>1)];
825 h->ref_cache[list][scan8[0] - 1 + 1*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1]>>1)];
827 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 0*8]=
828 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 1*8]= 0;
829 h->ref_cache[list][scan8[0] - 1 + 0*8]=
830 h->ref_cache[list][scan8[0] - 1 + 1*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
833 if(USES_LIST(left_type[1], list)){
834 const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
835 const int b8_xy= h->mb2b8_xy[left_xy[1]] + 1;
836 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[2]];
837 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[3]];
838 h->ref_cache[list][scan8[0] - 1 + 2*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[2]>>1)];
839 h->ref_cache[list][scan8[0] - 1 + 3*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[3]>>1)];
841 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 2*8]=
842 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 3*8]= 0;
843 h->ref_cache[list][scan8[0] - 1 + 2*8]=
844 h->ref_cache[list][scan8[0] - 1 + 3*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
845 assert((!left_type[0]) == (!left_type[1]));
848 if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
851 if(USES_LIST(topleft_type, list)){
852 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
853 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride;
854 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
855 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
857 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
858 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
861 if(USES_LIST(topright_type, list)){
862 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
863 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
864 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
865 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
867 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
868 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
871 if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
// pad the cache border cells that prediction may read
874 h->ref_cache[list][scan8[5 ]+1] =
875 h->ref_cache[list][scan8[7 ]+1] =
876 h->ref_cache[list][scan8[13]+1] = //FIXME remove past 3 (init somewhere else)
877 h->ref_cache[list][scan8[4 ]] =
878 h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
879 *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
880 *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
881 *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
882 *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
883 *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
886 /* XXX beurk, Load mvd */
887 if(USES_LIST(top_type, list)){
888 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
889 *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
890 *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
891 *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
892 *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
894 *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
895 *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
896 *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
897 *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
899 if(USES_LIST(left_type[0], list)){
900 const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
901 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
902 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
904 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
905 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
907 if(USES_LIST(left_type[1], list)){
908 const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
909 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
910 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
912 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
913 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
915 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
916 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
917 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
918 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
919 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
// B slices: cache which neighbour 8x8 blocks used direct prediction
921 if(h->slice_type == B_TYPE){
922 fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
924 if(IS_DIRECT(top_type)){
925 *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
926 }else if(IS_8X8(top_type)){
927 int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
928 h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
929 h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
931 *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
934 if(IS_DIRECT(left_type[0]))
935 h->direct_cache[scan8[0] - 1 + 0*8]= 1;
936 else if(IS_8X8(left_type[0]))
937 h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
939 h->direct_cache[scan8[0] - 1 + 0*8]= 0;
941 if(IS_DIRECT(left_type[1]))
942 h->direct_cache[scan8[0] - 1 + 2*8]= 1;
943 else if(IS_8X8(left_type[1]))
944 h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
946 h->direct_cache[scan8[0] - 1 + 2*8]= 0;
// MAP_F2F: remap cached refs/mvs between frame and field neighbours —
// the frame->field variant halves mv[1] and doubles the ref index,
// the field->frame variant does the inverse (see the two definitions below)
952 MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
953 MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
954 MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
955 MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
956 MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
957 MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
958 MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
959 MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
960 MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
961 MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
963 #define MAP_F2F(idx, mb_type)\
964 if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
965 h->ref_cache[list][idx] <<= 1;\
966 h->mv_cache[list][idx][1] /= 2;\
967 h->mvd_cache[list][idx][1] /= 2;\
972 #define MAP_F2F(idx, mb_type)\
973 if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
974 h->ref_cache[list][idx] >>= 1;\
975 h->mv_cache[list][idx][1] <<= 1;\
976 h->mvd_cache[list][idx][1] <<= 1;\
986 h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
// Writes the current mb's intra4x4 prediction modes from the cache back into
// the per-macroblock intra4x4_pred_mode array (entries 0..6 visible here;
// the remainder and the closing brace are elided in this listing).
989 static inline void write_back_intra_pred_mode(H264Context *h){
990 MpegEncContext * const s = &h->s;
991 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
993 h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
994 h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
995 h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
996 h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
997 h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
998 h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
999 h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
1003 * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
1005 static inline int check_intra4x4_pred_mode(H264Context *h){
1006 MpegEncContext * const s = &h->s;
// remap tables: for each intra4x4 mode, the substitute mode to use when the
// top (resp. left) neighbour samples are unavailable; -1 marks an illegal
// combination (the loops and error-return lines are elided in this listing)
1007 static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
1008 static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
1011 if(!(h->top_samples_available&0x8000)){
1013 int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
1015 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
1018 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
1023 if(!(h->left_samples_available&0x8000)){
1025 int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
1027 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
1030 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
1036 } //FIXME cleanup like next
1039 * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
// Validates an intra chroma/16x16 prediction mode against neighbour sample
// availability, remapping to a usable DC variant where possible.
// NOTE(review): the remapping statements and return lines are elided here;
// only the range check and the error logging are visible.
1041 static inline int check_intra_pred_mode(H264Context *h, int mode){
1042 MpegEncContext * const s = &h->s;
1043 static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
1044 static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
1046 if(mode < 0 || mode > 6) {
1047 av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
1051 if(!(h->top_samples_available&0x8000)){
1054 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
1059 if(!(h->left_samples_available&0x8000)){
1062 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
/* Predicted intra4x4 mode for block n: the minimum of the left and top
 * neighbours' cached modes; falls back to DC_PRED when either neighbour
 * is unavailable (negative cache entry). */
1071 * gets the predicted intra4x4 prediction mode.
1073 static inline int pred_intra_mode(H264Context *h, int n){
1074 const int index8= scan8[n];
1075 const int left= h->intra4x4_pred_mode_cache[index8 - 1];
1076 const int top = h->intra4x4_pred_mode_cache[index8 - 8];
1077 const int min= FFMIN(left, top);
1079 tprintf("mode:%d %d min:%d\n", left ,top, min);
/* Negative means "neighbour not available" -> use DC prediction. */
1081 if(min<0) return DC_PRED;
/* Copies the per-block non-zero coefficient counts from the 8-wide decode
 * cache back into the per-macroblock non_zero_count[] array (edge entries
 * only, in the layout neighbouring MBs expect to read). */
1085 static inline void write_back_non_zero_count(H264Context *h){
1086 MpegEncContext * const s = &h->s;
1087 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
1089 h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
1090 h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
1091 h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
1092 h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
1093 h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
1094 h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
1095 h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
/* Chroma edge entries (two planes, reversed index order). */
1097 h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
1098 h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
1099 h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
1101 h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
1102 h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
1103 h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
/* Pack one nnz flag per luma 4x4 block into a 16-bit mask for deblocking. */
1106 // store all luma nnzs, for deblocking
1109 v += (!!h->non_zero_count_cache[scan8[i]]) << i;
1110 *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
/* CAVLC nC prediction: average of the left and top neighbours' coefficient
 * counts (rounded up) when both are available; details of the availability
 * handling fall in lines not shown in this listing. */
1115 * gets the predicted number of non zero coefficients.
1116 * @param n block index
1118 static inline int pred_non_zero_count(H264Context *h, int n){
1119 const int index8= scan8[n];
1120 const int left= h->non_zero_count_cache[index8 - 1];
1121 const int top = h->non_zero_count_cache[index8 - 8];
/* i<64 => both neighbours valid: round the sum up to the mean. */
1124 if(i<64) i= (i+1)>>1;
1126 tprintf("pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
/* Returns the reference index of the diagonal (top-right, or top-left as
 * fallback) neighbour for MV prediction and points *C at its motion vector.
 * The MBAFF paths rescale MVs between field and frame coordinates
 * (SET_DIAG_MV applies the *2 / >>1 adjustment and remaps the ref index). */
1131 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
1132 const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
1134 /* there is no consistent mapping of mvs to neighboring locations that will
1135 * make mbaff happy, so we can't move all this logic to fill_caches */
1137 MpegEncContext *s = &h->s;
1138 const uint32_t *mb_types = s->current_picture_ptr->mb_type;
/* Scratch cache slot scan8[0]-2 holds the rescaled diagonal MV. */
1140 *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
1141 *C = h->mv_cache[list][scan8[0]-2];
1144 && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
1145 int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
1146 if(IS_INTERLACED(mb_types[topright_xy])){
1147 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
1148 const int x4 = X4, y4 = Y4;\
1149 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
1150 if(!USES_LIST(mb_type,list) && !IS_8X8(mb_type))\
1151 return LIST_NOT_USED;\
1152 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
1153 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
1154 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
1155 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
/* frame MB reading a field neighbour: double the vertical MV, halve the ref */
1157 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
1160 if(topright_ref == PART_NOT_AVAILABLE
1161 && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
1162 && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
1164 && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
1165 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
1168 && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
1169 && i >= scan8[0]+8){
1170 // leftshift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's ok.
1171 SET_DIAG_MV(>>1, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
/* Non-MBAFF / simple case: use top-right if available, else top-left. */
1177 if(topright_ref != PART_NOT_AVAILABLE){
1178 *C= h->mv_cache[list][ i - 8 + part_width ];
1179 return topright_ref;
1181 tprintf("topright MV not available\n");
1183 *C= h->mv_cache[list][ i - 8 - 1 ];
1184 return h->ref_cache[list][ i - 8 - 1 ];
/* Median motion-vector prediction from the left (A), top (B) and diagonal (C)
 * neighbours. When exactly one neighbour uses the same reference, that
 * neighbour's MV is used directly; otherwise the component-wise median. */
1189 * gets the predicted MV.
1190 * @param n the block index
1191 * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
1192 * @param mx the x component of the predicted motion vector
1193 * @param my the y component of the predicted motion vector
1195 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
1196 const int index8= scan8[n];
1197 const int top_ref= h->ref_cache[list][ index8 - 8 ];
1198 const int left_ref= h->ref_cache[list][ index8 - 1 ];
1199 const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
1200 const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
1202 int diagonal_ref, match_count;
1204 assert(part_width==1 || part_width==2 || part_width==4);
1214 diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
/* Count how many neighbours use the same reference picture. */
1215 match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
1216 tprintf("pred_motion match_count=%d\n", match_count);
1217 if(match_count > 1){ //most common
1218 *mx= mid_pred(A[0], B[0], C[0]);
1219 *my= mid_pred(A[1], B[1], C[1]);
1220 }else if(match_count==1){
1224 }else if(top_ref==ref){
/* No matching neighbour: only the left block available -> copy A, else median. */
1232 if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
1236 *mx= mid_pred(A[0], B[0], C[0]);
1237 *my= mid_pred(A[1], B[1], C[1]);
1241 tprintf("pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
/* Directional MV prediction for 16x8 partitions: top partition prefers the
 * top neighbour's MV, bottom partition prefers the left neighbour's, when
 * that neighbour shares the reference; otherwise fall back to pred_motion. */
1245 * gets the directionally predicted 16x8 MV.
1246 * @param n the block index
1247 * @param mx the x component of the predicted motion vector
1248 * @param my the y component of the predicted motion vector
1250 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
1252 const int top_ref= h->ref_cache[list][ scan8[0] - 8 ];
1253 const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
1255 tprintf("pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
/* Bottom half (n != 0): use the left neighbour of row scan8[8]. */
1263 const int left_ref= h->ref_cache[list][ scan8[8] - 1 ];
1264 const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
1266 tprintf("pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
1268 if(left_ref == ref){
/* Directional shortcut not applicable -> generic median prediction. */
1276 pred_motion(h, n, 4, list, ref, mx, my);
/* Directional MV prediction for 8x16 partitions: left partition prefers the
 * left neighbour, right partition prefers the diagonal neighbour, when the
 * references match; otherwise fall back to pred_motion. */
1280 * gets the directionally predicted 8x16 MV.
1281 * @param n the block index
1282 * @param mx the x component of the predicted motion vector
1283 * @param my the y component of the predicted motion vector
1285 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
1287 const int left_ref= h->ref_cache[list][ scan8[0] - 1 ];
1288 const int16_t * const A= h->mv_cache[list][ scan8[0] - 1 ];
1290 tprintf("pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
1292 if(left_ref == ref){
/* Right half: take the diagonal neighbour of block scan8[4]. */
1301 diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
1303 tprintf("pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
1305 if(diagonal_ref == ref){
1313 pred_motion(h, n, 2, list, ref, mx, my);
/* P_Skip MV prediction: the MV is forced to zero when either neighbour is
 * unavailable or a zero-MV/ref-0 neighbour exists; otherwise the regular
 * 16x16 list-0 prediction is used. */
1316 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
1317 const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
1318 const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
1320 tprintf("pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
1322 if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
1323 || (top_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
1324 || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
1330 pred_motion(h, 0, 4, 0, 0, mx, my);
/* Precomputes the temporal-direct distance scale factor per list-0 reference:
 * dist_scale_factor[i] = clip((tb*tx + 32) >> 6) with tx derived from the POC
 * distance td, per the H.264 temporal direct derivation. Also mirrors the
 * values into the field-pair table. */
1335 static inline void direct_dist_scale_factor(H264Context * const h){
1336 const int poc = h->s.current_picture_ptr->poc;
1337 const int poc1 = h->ref_list[1][0].poc;
1339 for(i=0; i<h->ref_count[0]; i++){
1340 int poc0 = h->ref_list[0][i].poc;
1341 int td = clip(poc1 - poc0, -128, 127);
1342 if(td == 0 /* FIXME || pic0 is a long-term ref */){
/* Equal POCs -> neutral scale (256 == 1.0 in 8-bit fixed point). */
1343 h->dist_scale_factor[i] = 256;
1345 int tb = clip(poc - poc0, -128, 127);
1346 int tx = (16384 + (ABS(td) >> 1)) / td;
1347 h->dist_scale_factor[i] = clip((tb*tx + 32) >> 6, -1024, 1023);
/* Duplicate each frame entry for the two fields of a field pair. */
1351 for(i=0; i<h->ref_count[0]; i++){
1352 h->dist_scale_factor_field[2*i] =
1353 h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
/* Records the current picture's ref counts/POCs and, for temporal direct
 * mode, builds map_col_to_list0[]: for each reference of the co-located
 * picture, the index of the list-0 reference with the same POC (0 if the
 * frame is missing). Field variants are derived by doubling the indices. */
1357 static inline void direct_ref_list_init(H264Context * const h){
1358 MpegEncContext * const s = &h->s;
1359 Picture * const ref1 = &h->ref_list[1][0];
1360 Picture * const cur = s->current_picture_ptr;
1362 if(cur->pict_type == I_TYPE)
1363 cur->ref_count[0] = 0;
1364 if(cur->pict_type != B_TYPE)
1365 cur->ref_count[1] = 0;
1366 for(list=0; list<2; list++){
1367 cur->ref_count[list] = h->ref_count[list];
1368 for(j=0; j<h->ref_count[list]; j++)
1369 cur->ref_poc[list][j] = h->ref_list[list][j].poc;
/* Mapping only needed for temporal direct B-frames. */
1371 if(cur->pict_type != B_TYPE || h->direct_spatial_mv_pred)
1373 for(list=0; list<2; list++){
1374 for(i=0; i<ref1->ref_count[list]; i++){
1375 const int poc = ref1->ref_poc[list][i];
1376 h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
1377 for(j=0; j<h->ref_count[list]; j++)
1378 if(h->ref_list[list][j].poc == poc){
1379 h->map_col_to_list0[list][i] = j;
/* Field mapping: each frame ref expands to a top/bottom field pair. */
1385 for(list=0; list<2; list++){
1386 for(i=0; i<ref1->ref_count[list]; i++){
1387 j = h->map_col_to_list0[list][i];
1388 h->map_col_to_list0_field[list][2*i] = 2*j;
1389 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
/* Derives motion data for B-direct macroblocks/partitions. Two branches:
 * spatial direct (refs = min of neighbours, MVs from pred_motion, zeroed
 * when the co-located block is static) and temporal direct (scale the
 * co-located list-1 picture's MVs by dist_scale_factor). Handles MBAFF
 * frame<->field scaling of the co-located data. Reads the l1mv0/l1mv1 and
 * l1ref0/l1ref1 arrays of ref_list[1][0]; writes mv_cache/ref_cache,
 * sub_mb_type[] and *mb_type.
 * NOTE(review): this listing omits many interior lines; comments below only
 * describe what is visible. */
1395 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
1396 MpegEncContext * const s = &h->s;
1397 const int mb_xy = s->mb_x + s->mb_y*s->mb_stride;
1398 const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1399 const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1400 const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
1401 const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
1402 const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
1403 const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
1404 const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
1405 const int is_b8x8 = IS_8X8(*mb_type);
1409 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
/* Choose partitioning of the direct MB from the co-located MB's type. */
1410 if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
1411 /* FIXME save sub mb types from previous frames (or derive from MVs)
1412 * so we know exactly what block size to use */
1413 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1414 *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1;
1415 }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
1416 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1417 *mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1419 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1420 *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1;
1423 *mb_type |= MB_TYPE_DIRECT2;
1425 *mb_type |= MB_TYPE_INTERLACED;
1427 tprintf("mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);
/* ---- spatial direct prediction ---- */
1429 if(h->direct_spatial_mv_pred){
1434 /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1436 /* ref = min(neighbors) */
1437 for(list=0; list<2; list++){
1438 int refa = h->ref_cache[list][scan8[0] - 1];
1439 int refb = h->ref_cache[list][scan8[0] - 8];
1440 int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1442 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1444 if(ref[list] < 0 || (refb < ref[list] && refb >= 0))
1446 if(ref[list] < 0 || (refc < ref[list] && refc >= 0))
/* No neighbour gives a reference -> both refs 0, both MVs zero. */
1452 if(ref[0] < 0 && ref[1] < 0){
1453 ref[0] = ref[1] = 0;
1454 mv[0][0] = mv[0][1] =
1455 mv[1][0] = mv[1][1] = 0;
1457 for(list=0; list<2; list++){
1459 pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1461 mv[list][0] = mv[list][1] = 0;
/* Drop the unused prediction list from the MB/sub-MB type flags. */
1466 *mb_type &= ~MB_TYPE_P0L1;
1467 sub_mb_type &= ~MB_TYPE_P0L1;
1468 }else if(ref[0] < 0){
1469 *mb_type &= ~MB_TYPE_P0L0;
1470 sub_mb_type &= ~MB_TYPE_P0L0;
1473 if(IS_16X16(*mb_type)){
1474 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1475 fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
/* Co-located block nearly static (|mv| <= 1, ref 0) -> force zero MVs.
 * The x264_build check works around an old x264 bug. */
1476 if(!IS_INTRA(mb_type_col)
1477 && ( (l1ref0[0] == 0 && ABS(l1mv0[0][0]) <= 1 && ABS(l1mv0[0][1]) <= 1)
1478 || (l1ref0[0] < 0 && l1ref1[0] == 0 && ABS(l1mv1[0][0]) <= 1 && ABS(l1mv1[0][1]) <= 1
1479 && (h->x264_build>33 || !h->x264_build)))){
1481 fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1483 fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
1485 fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1487 fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
1489 fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1490 fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv[1][0],mv[1][1]), 4);
/* Per-8x8 spatial direct, with per-4x4 static checks when needed. */
1493 for(i8=0; i8<4; i8++){
1494 const int x8 = i8&1;
1495 const int y8 = i8>>1;
1497 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1499 h->sub_mb_type[i8] = sub_mb_type;
1501 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1502 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1503 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1504 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1507 if(!IS_INTRA(mb_type_col) && ( l1ref0[x8 + y8*h->b8_stride] == 0
1508 || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
1509 && (h->x264_build>33 || !h->x264_build)))){
1510 const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
1511 if(IS_SUB_8X8(sub_mb_type)){
1512 const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1513 if(ABS(mv_col[0]) <= 1 && ABS(mv_col[1]) <= 1){
1515 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1517 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1520 for(i4=0; i4<4; i4++){
1521 const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1522 if(ABS(mv_col[0]) <= 1 && ABS(mv_col[1]) <= 1){
1524 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1526 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1532 }else{ /* direct temporal mv pred */
1533 const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1534 const int *dist_scale_factor = h->dist_scale_factor;
/* MBAFF field MBs use the field variants of the mapping tables. */
1537 if(IS_INTERLACED(*mb_type)){
1538 map_col_to_list0[0] = h->map_col_to_list0_field[0];
1539 map_col_to_list0[1] = h->map_col_to_list0_field[1];
1540 dist_scale_factor = h->dist_scale_factor_field;
/* Co-located picture has the opposite frame/field structure:
 * re-point l1ref*/l1mv* into the matching field/frame rows. */
1542 if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1543 /* FIXME assumes direct_8x8_inference == 1 */
1544 const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1545 int mb_types_col[2];
1548 *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
1549 | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
1550 | (*mb_type & MB_TYPE_INTERLACED);
1551 sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;
1553 if(IS_INTERLACED(*mb_type)){
1554 /* frame to field scaling */
1555 mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1556 mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1558 l1ref0 -= 2*h->b8_stride;
1559 l1ref1 -= 2*h->b8_stride;
1560 l1mv0 -= 4*h->b_stride;
1561 l1mv1 -= 4*h->b_stride;
1565 if( (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
1566 && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
1568 *mb_type |= MB_TYPE_16x8;
1570 *mb_type |= MB_TYPE_8x8;
1572 /* field to frame scaling */
1573 /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
1574 * but in MBAFF, top and bottom POC are equal */
1575 int dy = (s->mb_y&1) ? 1 : 2;
1577 mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1578 l1ref0 += dy*h->b8_stride;
1579 l1ref1 += dy*h->b8_stride;
1580 l1mv0 += 2*dy*h->b_stride;
1581 l1mv1 += 2*dy*h->b_stride;
1584 if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
1586 *mb_type |= MB_TYPE_16x16;
1588 *mb_type |= MB_TYPE_8x8;
/* Per-8x8 temporal direct with y_shift-based frame<->field MV scaling. */
1591 for(i8=0; i8<4; i8++){
1592 const int x8 = i8&1;
1593 const int y8 = i8>>1;
1595 const int16_t (*l1mv)[2]= l1mv0;
1597 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1599 h->sub_mb_type[i8] = sub_mb_type;
1601 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1602 if(IS_INTRA(mb_types_col[y8])){
1603 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1604 fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1605 fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1609 ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
1611 ref0 = map_col_to_list0[0][ref0*2>>y_shift];
1613 ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
1616 scale = dist_scale_factor[ref0];
1617 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1620 const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
1621 int my_col = (mv_col[1]<<y_shift)/2;
1622 int mx = (scale * mv_col[0] + 128) >> 8;
1623 int my = (scale * my_col + 128) >> 8;
1624 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1625 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1632 /* one-to-one mv scaling */
/* Same frame/field structure: scale co-located MVs directly. */
1634 if(IS_16X16(*mb_type)){
1635 fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1636 if(IS_INTRA(mb_type_col)){
1637 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
1638 fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
1639 fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
1641 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
1642 : map_col_to_list0[1][l1ref1[0]];
1643 const int scale = dist_scale_factor[ref0];
1644 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1646 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1647 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1648 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref0, 1);
1649 fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv_l0[0],mv_l0[1]), 4);
/* list1 MV = list0 MV - colocated MV (standard temporal direct). */
1650 fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]), 4);
1653 for(i8=0; i8<4; i8++){
1654 const int x8 = i8&1;
1655 const int y8 = i8>>1;
1657 const int16_t (*l1mv)[2]= l1mv0;
1659 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1661 h->sub_mb_type[i8] = sub_mb_type;
1662 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1663 if(IS_INTRA(mb_type_col)){
1664 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1665 fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1666 fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1670 ref0 = l1ref0[x8 + y8*h->b8_stride];
1672 ref0 = map_col_to_list0[0][ref0];
1674 ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
1677 scale = dist_scale_factor[ref0];
1679 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1680 if(IS_SUB_8X8(sub_mb_type)){
1681 const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1682 int mx = (scale * mv_col[0] + 128) >> 8;
1683 int my = (scale * mv_col[1] + 128) >> 8;
1684 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1685 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1687 for(i4=0; i4<4; i4++){
1688 const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1689 int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1690 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1691 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1692 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1693 pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
/* Flushes the per-MB motion caches (MVs, ref indices, CABAC mvd, direct
 * flags) back into the current picture's frame-wide arrays. */
1700 static inline void write_back_motion(H264Context *h, int mb_type){
1701 MpegEncContext * const s = &h->s;
1702 const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1703 const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
/* List 0 unused: mark the 2x2 ref_index block as LIST_NOT_USED. */
1706 if(!USES_LIST(mb_type, 0))
1707 fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1709 for(list=0; list<2; list++){
1711 if(!USES_LIST(mb_type, list))
/* Copy the 4x4 MV grid row by row, two MVs (64 bits) at a time. */
1715 *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1716 *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1718 if( h->pps.cabac ) {
1719 if(IS_SKIP(mb_type))
1720 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1723 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1724 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
/* Write back the four 8x8-block reference indices. */
1729 int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1730 ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1731 ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1732 ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1733 ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
/* CABAC B-frames also persist the per-8x8 direct-mode flags. */
1737 if(h->slice_type == B_TYPE && h->pps.cabac){
1738 if(IS_8X8(mb_type)){
1739 uint8_t *direct_table = &h->direct_table[b8_xy];
1740 direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1741 direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1742 direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
/* Parses a NAL unit header (ref_idc, unit type) and removes the 00 00 03
 * emulation-prevention bytes from the RBSP payload. Returns the source
 * buffer directly when no escapes are present; otherwise decodes into
 * h->rbsp_buffer. */
1748 * Decodes a network abstraction layer unit.
1749 * @param consumed is the number of bytes used as input
1750 * @param length is the length of the array
1751 * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1752 * @returns decoded bytes, might be src+1 if no escapes
1754 static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *consumed, int length){
1758 // src[0]&0x80; //forbidden bit
1759 h->nal_ref_idc= src[0]>>5;
1760 h->nal_unit_type= src[0]&0x1F;
1764 for(i=0; i<length; i++)
1765 printf("%2X ", src[i]);
/* Scan for the first 00 00 0x sequence (escape or next start code). */
1767 for(i=0; i+1<length; i+=2){
1768 if(src[i]) continue;
1769 if(i>0 && src[i-1]==0) i--;
1770 if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1772 /* startcode, so we must be past the end */
1779 if(i>=length-1){ //no escaped 0
1780 *dst_length= length;
1781 *consumed= length+1; //+1 for the header
1785 h->rbsp_buffer= av_fast_realloc(h->rbsp_buffer, &h->rbsp_buffer_size, length);
1786 dst= h->rbsp_buffer;
1788 //printf("decoding esc\n");
1791 //remove escapes (very rare 1:2^22)
1792 if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1793 if(src[si+2]==3){ //escape
1798 }else //next start code
1802 dst[di++]= src[si++];
1806 *consumed= si + 1;//+1 for the header
1807 //FIXME store exact number of bits in the getbitcontext (its needed for decoding)
/* Inverse of decode_nal: writes the NAL header byte, then copies src into
 * dst+1 while inserting 0x03 emulation-prevention bytes after each 00 00
 * pair that would otherwise form a start-code prefix. */
1813 * @param src the data which should be escaped
1814 * @param dst the target buffer, dst+1 == src is allowed as a special case
1815 * @param length the length of the src data
1816 * @param dst_length the length of the dst array
1817 * @returns length of escaped data in bytes or -1 if an error occured
1819 static int encode_nal(H264Context *h, uint8_t *dst, uint8_t *src, int length, int dst_length){
1820 int i, escape_count, si, di;
1824 assert(dst_length>0);
/* NAL header byte: nal_ref_idc in bits 5-6, unit type in bits 0-4. */
1826 dst[0]= (h->nal_ref_idc<<5) + h->nal_unit_type;
1828 if(length==0) return 1;
/* First pass: count positions that need an escape byte. */
1831 for(i=0; i<length; i+=2){
1832 if(src[i]) continue;
1833 if(i>0 && src[i-1]==0)
1835 if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1841 if(escape_count==0){
1843 memcpy(dst+1, src, length);
1847 if(length + escape_count + 1> dst_length)
1850 //this should be damn rare (hopefully)
1852 h->rbsp_buffer= av_fast_realloc(h->rbsp_buffer, &h->rbsp_buffer_size, length + escape_count);
1853 temp= h->rbsp_buffer;
1854 //printf("encoding esc\n");
/* Second pass: copy while inserting 0x03 after each 00 00 pair. */
1859 if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1860 temp[di++]= 0; si++;
1861 temp[di++]= 0; si++;
1863 temp[di++]= src[si++];
1866 temp[di++]= src[si++];
1868 memcpy(dst+1, temp, length+escape_count);
1870 assert(di == length+escape_count);
/* Emits rbsp_trailing_bits: a 1 bit (on a line not shown here) followed by
 * zero bits up to the next byte boundary. */
1876 * write 1,10,100,1000,... for alignment, yes its exactly inverse to mpeg4
1878 static void encode_rbsp_trailing(PutBitContext *pb){
/* Number of zero bits needed to reach byte alignment. */
1881 length= (-put_bits_count(pb))&7;
1882 if(length) put_bits(pb, length, 0);
/* Locates the rbsp_stop_one_bit in the last byte to find the exact end of
 * the bitstream; returns 0 if the trailing pattern is damaged. */
1887 * identifies the exact end of the bitstream
1888 * @return the length of the trailing, or 0 if damaged
1890 static int decode_rbsp_trailing(uint8_t *src){
1894 tprintf("rbsp trailing %X\n", v);
/* 4x4 Hadamard inverse transform of the 16 luma DC coefficients followed by
 * dequantization ((x*qmul + 128) >> 8); operates in place on the DC
 * positions of the 16x16 block array (stride 16). */
1904 * idct tranforms the 16 dc values and dequantize them.
1905 * @param qp quantization parameter
1907 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1910 int temp[16]; //FIXME check if this is a good idea
1911 static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride};
1912 static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1914 //memset(block, 64, 2*256);
/* Horizontal butterfly pass into temp[]. */
1917 const int offset= y_offset[i];
1918 const int z0= block[offset+stride*0] + block[offset+stride*4];
1919 const int z1= block[offset+stride*0] - block[offset+stride*4];
1920 const int z2= block[offset+stride*1] - block[offset+stride*5];
1921 const int z3= block[offset+stride*1] + block[offset+stride*5];
/* Vertical butterfly pass + dequant back into the block. */
1930 const int offset= x_offset[i];
1931 const int z0= temp[4*0+i] + temp[4*2+i];
1932 const int z1= temp[4*0+i] - temp[4*2+i];
1933 const int z2= temp[4*1+i] - temp[4*3+i];
1934 const int z3= temp[4*1+i] + temp[4*3+i];
1936 block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_resdual
1937 block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1938 block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1939 block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
/* Forward 4x4 Hadamard transform of the 16 luma DC values (encoder side),
 * with a >>1 normalization on output; same block layout as the inverse. */
1945 * dct tranforms the 16 dc values.
1946 * @param qp quantization parameter ??? FIXME
1948 static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
1949 // const int qmul= dequant_coeff[qp][0];
1951 int temp[16]; //FIXME check if this is a good idea
1952 static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride};
1953 static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
/* First butterfly pass into temp[]. */
1956 const int offset= y_offset[i];
1957 const int z0= block[offset+stride*0] + block[offset+stride*4];
1958 const int z1= block[offset+stride*0] - block[offset+stride*4];
1959 const int z2= block[offset+stride*1] - block[offset+stride*5];
1960 const int z3= block[offset+stride*1] + block[offset+stride*5];
/* Second pass with >>1 scaling written back to the block. */
1969 const int offset= x_offset[i];
1970 const int z0= temp[4*0+i] + temp[4*2+i];
1971 const int z1= temp[4*0+i] - temp[4*2+i];
1972 const int z2= temp[4*1+i] - temp[4*3+i];
1973 const int z3= temp[4*1+i] + temp[4*3+i];
1975 block[stride*0 +offset]= (z0 + z3)>>1;
1976 block[stride*2 +offset]= (z1 + z2)>>1;
1977 block[stride*8 +offset]= (z1 - z2)>>1;
1978 block[stride*10+offset]= (z0 - z3)>>1;
/* 2x2 Hadamard inverse transform + dequantization of the chroma DC
 * coefficients, in place (stride 32, xStride 16). */
1986 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1987 const int stride= 16*2;
1988 const int xStride= 16;
1991 a= block[stride*0 + xStride*0];
1992 b= block[stride*0 + xStride*1];
1993 c= block[stride*1 + xStride*0];
1994 d= block[stride*1 + xStride*1];
/* 2x2 butterfly (e derived from b/d on lines not shown), then dequant. */
2001 block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
2002 block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
2003 block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
2004 block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
/* Forward 2x2 Hadamard transform of the chroma DC values (encoder side);
 * same layout as chroma_dc_dequant_idct_c, no scaling applied. */
2008 static void chroma_dc_dct_c(DCTELEM *block){
2009 const int stride= 16*2;
2010 const int xStride= 16;
2013 a= block[stride*0 + xStride*0];
2014 b= block[stride*0 + xStride*1];
2015 c= block[stride*1 + xStride*0];
2016 d= block[stride*1 + xStride*1];
2023 block[stride*0 + xStride*0]= (a+c);
2024 block[stride*0 + xStride*1]= (e+b);
2025 block[stride*1 + xStride*0]= (a-c);
2026 block[stride*1 + xStride*1]= (e-b);
/* Maps the luma QP plus the PPS chroma offset through the chroma_qp[] table,
 * clipping the index to the legal 0..51 range. */
2031 * gets the chroma qp.
2033 static inline int get_chroma_qp(int chroma_qp_index_offset, int qscale){
2035 return chroma_qp[clip(qscale + chroma_qp_index_offset, 0, 51)];
/* Forward 4x4 H.264 integer transform of the pixel difference src1 - src2,
 * writing coefficients to block[]: row pass on the differences, then a
 * column pass over the intermediate results. */
2040 static void h264_diff_dct_c(DCTELEM *block, uint8_t *src1, uint8_t *src2, int stride){
2042 //FIXME try int temp instead of block
/* Row transform of the 4 residual samples of each line. */
2045 const int d0= src1[0 + i*stride] - src2[0 + i*stride];
2046 const int d1= src1[1 + i*stride] - src2[1 + i*stride];
2047 const int d2= src1[2 + i*stride] - src2[2 + i*stride];
2048 const int d3= src1[3 + i*stride] - src2[3 + i*stride];
2049 const int z0= d0 + d3;
2050 const int z3= d0 - d3;
2051 const int z1= d1 + d2;
2052 const int z2= d1 - d2;
2054 block[0 + 4*i]= z0 + z1;
2055 block[1 + 4*i]= 2*z3 + z2;
2056 block[2 + 4*i]= z0 - z1;
2057 block[3 + 4*i]= z3 - 2*z2;
/* Column transform in place. */
2061 const int z0= block[0*4 + i] + block[3*4 + i];
2062 const int z3= block[0*4 + i] - block[3*4 + i];
2063 const int z1= block[1*4 + i] + block[2*4 + i];
2064 const int z2= block[1*4 + i] - block[2*4 + i];
2066 block[0*4 + i]= z0 + z1;
2067 block[1*4 + i]= 2*z3 + z2;
2068 block[2*4 + i]= z0 - z1;
2069 block[3*4 + i]= z3 - 2*z2;
/* Quantizes a 4x4 coefficient block in scan order with a dead zone
 * (intra bias 1/3, inter 1/6 of the quant step). The seperate_dc paths use
 * a different shift for the DC coefficient (QUANT_SHIFT-2 for the luma DC
 * table at qscale+18, QUANT_SHIFT+1 otherwise). Returns the last non-zero
 * index. The threshold1/2 comparison skips the division for coefficients
 * that quantize to zero. */
2074 //FIXME need to check that this doesnt overflow signed 32 bit for low qp, i am not sure, it's very close
2075 //FIXME check that gcc inlines this (and optimizes intra & seperate_dc stuff away)
2076 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int seperate_dc){
2078 const int * const quant_table= quant_coeff[qscale];
2079 const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
2080 const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
2081 const unsigned int threshold2= (threshold1<<1);
/* DC handled separately: luma-DC branch uses quant_coeff[qscale+18]. */
2087 const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
2088 const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
2089 const unsigned int dc_threshold2= (dc_threshold1<<1);
2091 int level= block[0]*quant_coeff[qscale+18][0];
2092 if(((unsigned)(level+dc_threshold1))>dc_threshold2){
2094 level= (dc_bias + level)>>(QUANT_SHIFT-2);
2097 level= (dc_bias - level)>>(QUANT_SHIFT-2);
2100 // last_non_zero = i;
/* Alternative DC branch with QUANT_SHIFT+1 scaling. */
2105 const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
2106 const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
2107 const unsigned int dc_threshold2= (dc_threshold1<<1);
2109 int level= block[0]*quant_table[0];
2110 if(((unsigned)(level+dc_threshold1))>dc_threshold2){
2112 level= (dc_bias + level)>>(QUANT_SHIFT+1);
2115 level= (dc_bias - level)>>(QUANT_SHIFT+1);
2118 // last_non_zero = i;
/* AC coefficients in scan order. */
2131 const int j= scantable[i];
2132 int level= block[j]*quant_table[j];
2134 // if( bias+level >= (1<<(QMAT_SHIFT - 3))
2135 // || bias-level >= (1<<(QMAT_SHIFT - 3))){
2136 if(((unsigned)(level+threshold1))>threshold2){
2138 level= (bias + level)>>QUANT_SHIFT;
2141 level= (bias - level)>>QUANT_SHIFT;
2150 return last_non_zero;
/* 4x4 vertical intra prediction: replicate the row above into all 4 rows
 * (one 32-bit store per row). */
2153 static void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){
2154 const uint32_t a= ((uint32_t*)(src-stride))[0];
2155 ((uint32_t*)(src+0*stride))[0]= a;
2156 ((uint32_t*)(src+1*stride))[0]= a;
2157 ((uint32_t*)(src+2*stride))[0]= a;
2158 ((uint32_t*)(src+3*stride))[0]= a;
/* 4x4 horizontal intra prediction: replicate each row's left neighbour
 * across the row (x * 0x01010101 broadcasts a byte into a word). */
2161 static void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){
2162 ((uint32_t*)(src+0*stride))[0]= src[-1+0*stride]*0x01010101;
2163 ((uint32_t*)(src+1*stride))[0]= src[-1+1*stride]*0x01010101;
2164 ((uint32_t*)(src+2*stride))[0]= src[-1+2*stride]*0x01010101;
2165 ((uint32_t*)(src+3*stride))[0]= src[-1+3*stride]*0x01010101;
/* 4x4 DC intra prediction: fill with the rounded mean of the 4 top and
 * 4 left neighbour samples. */
2168 static void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){
2169 const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
2170 + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
2172 ((uint32_t*)(src+0*stride))[0]=
2173 ((uint32_t*)(src+1*stride))[0]=
2174 ((uint32_t*)(src+2*stride))[0]=
2175 ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101;
static void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){
    /* DC prediction from the 4 left neighbours only (top row unavailable). */
    const int dc= (src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >> 2;
    int y;

    for(y=0; y<4; y++)
        memset(src + y*stride, dc, 4);
}
static void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){
    /* DC prediction from the 4 top neighbours only (left column unavailable). */
    const int dc= (src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >> 2;
    int y;

    for(y=0; y<4; y++)
        memset(src + y*stride, dc, 4);
}
static void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){
    /* No neighbours available: predict mid-grey (128 for 8-bit samples). */
    int y;

    for(y=0; y<4; y++)
        memset(src + y*stride, 128, 4);
}
/* Helper macros for the directional 4x4 predictors below: declare the
 * neighbouring sample values as locals (t4..t7 from the top-right block,
 * l0..l3 from the left column, t0..t3 from the row above). Each macro body
 * is continued with trailing backslashes; NOTE(review): the blank lines that
 * originally terminated each macro are missing from this extract. */
2204 #define LOAD_TOP_RIGHT_EDGE\
2205 const int t4= topright[0];\
2206 const int t5= topright[1];\
2207 const int t6= topright[2];\
2208 const int t7= topright[3];\
2210 #define LOAD_LEFT_EDGE\
2211 const int l0= src[-1+0*stride];\
2212 const int l1= src[-1+1*stride];\
2213 const int l2= src[-1+2*stride];\
2214 const int l3= src[-1+3*stride];\
2216 #define LOAD_TOP_EDGE\
2217 const int t0= src[ 0-1*stride];\
2218 const int t1= src[ 1-1*stride];\
2219 const int t2= src[ 2-1*stride];\
2220 const int t3= src[ 3-1*stride];\
/* Directional 4x4 intra predictors. Each filters the neighbouring samples
 * (lt = top-left corner, t0.. = top row, l0.. = left column, via the
 * LOAD_*_EDGE macros) with the spec's (a+2b+c+2)>>2 / (a+b+1)>>1 taps.
 * NOTE(review): the LOAD_*_EDGE invocation lines and several assignments are
 * missing from this extract (original line numbers jump); kept verbatim. */
2222 static void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){
2223 const int lt= src[-1-1*stride];
2227 src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2;
2229 src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
2232 src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
2236 src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
2239 src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
2241 src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
2242 src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
/* diagonal down-left: uses the top and top-right rows */
2245 static void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){
2250 src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
2252 src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
2255 src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
2259 src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
2262 src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
2264 src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
2265 src[3+3*stride]=(t6 + 3*t7 + 2)>>2;
/* vertical-right: mixes 2-tap averages (top row) and 3-tap filters */
2268 static void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){
2269 const int lt= src[-1-1*stride];
2272 const __attribute__((unused)) int unu= l3;
2275 src[1+2*stride]=(lt + t0 + 1)>>1;
2277 src[2+2*stride]=(t0 + t1 + 1)>>1;
2279 src[3+2*stride]=(t1 + t2 + 1)>>1;
2280 src[3+0*stride]=(t2 + t3 + 1)>>1;
2282 src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
2284 src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
2286 src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
2287 src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
2288 src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
2289 src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
/* vertical-left: top + top-right neighbours only */
2292 static void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){
2295 const __attribute__((unused)) int unu= t7;
2297 src[0+0*stride]=(t0 + t1 + 1)>>1;
2299 src[0+2*stride]=(t1 + t2 + 1)>>1;
2301 src[1+2*stride]=(t2 + t3 + 1)>>1;
2303 src[2+2*stride]=(t3 + t4+ 1)>>1;
2304 src[3+2*stride]=(t4 + t5+ 1)>>1;
2305 src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
2307 src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
2309 src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
2311 src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
2312 src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
/* horizontal-up: left column only; lower rows (missing here) saturate to l3 */
2315 static void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){
2318 src[0+0*stride]=(l0 + l1 + 1)>>1;
2319 src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
2321 src[0+1*stride]=(l1 + l2 + 1)>>1;
2323 src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
2325 src[0+2*stride]=(l2 + l3 + 1)>>1;
2327 src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2;
/* horizontal-down: left column + top-left corner + top row */
2336 static void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){
2337 const int lt= src[-1-1*stride];
2340 const __attribute__((unused)) int unu= t3;
2343 src[2+1*stride]=(lt + l0 + 1)>>1;
2345 src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
2346 src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
2347 src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
2349 src[2+2*stride]=(l0 + l1 + 1)>>1;
2351 src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
2353 src[2+3*stride]=(l1 + l2+ 1)>>1;
2355 src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
2356 src[0+3*stride]=(l2 + l3 + 1)>>1;
2357 src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
static void pred16x16_vertical_c(uint8_t *src, int stride){
    int y;

    /* Vertical 16x16 prediction: replicate the 16 pixels above the
     * macroblock into all 16 rows. memcpy replaces the four uint32_t*
     * punned stores per row (strict-aliasing/alignment UB). */
    for(y=0; y<16; y++)
        memcpy(src + y*stride, src - stride, 16);
}
static void pred16x16_horizontal_c(uint8_t *src, int stride){
    int y;

    /* Horizontal 16x16 prediction: each row is filled with the pixel to its
     * immediate left (memset replicates the byte, avoiding the original's
     * uint32_t* type-punning UB). */
    for(y=0; y<16; y++)
        memset(src + y*stride, src[-1 + y*stride], 16);
}
static void pred16x16_dc_c(uint8_t *src, int stride){
    int i, dc=0;

    /* DC 16x16 prediction: rounded mean of the 16 left and 16 top
     * neighbours, written to every pixel of the macroblock.
     * (Restores the top-row accumulation loop and declarations that are
     * missing from this extract; fill uses memset instead of the original
     * uint32_t* punned stores.) */
    for(i=0; i<16; i++)
        dc+= src[-1+i*stride];
    for(i=0; i<16; i++)
        dc+= src[i-stride];

    dc= (dc + 16) >> 5;

    for(i=0; i<16; i++)
        memset(src + i*stride, dc, 16);
}
static void pred16x16_left_dc_c(uint8_t *src, int stride){
    int i, dc=0;

    /* DC from the 16 left neighbours only (top row unavailable). */
    for(i=0; i<16; i++)
        dc+= src[-1+i*stride];

    dc= (dc + 8) >> 4;

    for(i=0; i<16; i++)
        memset(src + i*stride, dc, 16);
}
static void pred16x16_top_dc_c(uint8_t *src, int stride){
    int i, dc=0;

    /* DC from the 16 top neighbours only (left column unavailable).
     * (Restores the accumulation loop missing from this extract.) */
    for(i=0; i<16; i++)
        dc+= src[i-stride];

    dc= (dc + 8) >> 4;

    for(i=0; i<16; i++)
        memset(src + i*stride, dc, 16);
}
static void pred16x16_128_dc_c(uint8_t *src, int stride){
    int i;

    /* No neighbours available: fill the macroblock with mid-grey (128). */
    for(i=0; i<16; i++)
        memset(src + i*stride, 128, 16);
}
/* Plane (gradient) 16x16 prediction shared between H.264 and SVQ3.
 * Fits H (horizontal) and V (vertical) gradients from the border samples,
 * then fills the block with a clipped linear ramp via the cropTbl clip LUT.
 * NOTE(review): several declaration and loop-body lines are missing from
 * this extract (original line numbers jump); code kept verbatim. */
2451 static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3){
2454 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2455 const uint8_t * const src0 = src+7-stride;
2456 const uint8_t *src1 = src+8*stride-1;
2457 const uint8_t *src2 = src1-2*stride; // == src+6*stride-1;
2458 int H = src0[1] - src0[-1];
2459 int V = src1[0] - src2[ 0];
/* weighted sum of border differences, weights 2..8 */
2460 for(k=2; k<=8; ++k) {
2461 src1 += stride; src2 -= stride;
2462 H += k*(src0[k] - src0[-k]);
2463 V += k*(src1[0] - src2[ 0]);
/* SVQ3 variant: different rounding and swapped H/V */
2466 H = ( 5*(H/4) ) / 16;
2467 V = ( 5*(V/4) ) / 16;
2469 /* required for 100% accuracy */
2470 i = H; H = V; V = i;
/* H.264 variant: (5*H+32)>>6 per the spec */
2472 H = ( 5*H+32 ) >> 6;
2473 V = ( 5*V+32 ) >> 6;
2476 a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
2477 for(j=16; j>0; --j) {
2480 for(i=-16; i<0; i+=4) {
2481 src[16+i] = cm[ (b ) >> 5 ];
2482 src[17+i] = cm[ (b+ H) >> 5 ];
2483 src[18+i] = cm[ (b+2*H) >> 5 ];
2484 src[19+i] = cm[ (b+3*H) >> 5 ];
/* thin H.264 wrapper: svq3 == 0 selects the standard rounding path */
2491 static void pred16x16_plane_c(uint8_t *src, int stride){
2492 pred16x16_plane_compat_c(src, stride, 0);
static void pred8x8_vertical_c(uint8_t *src, int stride){
    int y;

    /* Vertical 8x8 (chroma) prediction: replicate the 8 pixels above the
     * block into all 8 rows; memcpy avoids the original's uint32_t*
     * type-punning UB and restores the loop lines missing from this extract. */
    for(y=0; y<8; y++)
        memcpy(src + y*stride, src - stride, 8);
}
static void pred8x8_horizontal_c(uint8_t *src, int stride){
    int y;

    /* Horizontal 8x8 prediction: each row is filled with its left neighbour. */
    for(y=0; y<8; y++)
        memset(src + y*stride, src[-1 + y*stride], 8);
}
static void pred8x8_128_dc_c(uint8_t *src, int stride){
    int y;

    /* No neighbours available: fill the 8x8 block with mid-grey (128). */
    for(y=0; y<8; y++)
        memset(src + y*stride, 128, 8);
}
static void pred8x8_left_dc_c(uint8_t *src, int stride){
    int i;
    int dc0=0, dc2=0;

    /* Separate DC per 4-row half, each from its own 4 left neighbours
     * (top row unavailable). memset replaces the uint32_t* punned stores. */
    for(i=0; i<4; i++){
        dc0+= src[-1+ i   *stride];
        dc2+= src[-1+(i+4)*stride];
    }
    dc0= (dc0 + 2) >> 2;
    dc2= (dc2 + 2) >> 2;

    for(i=0; i<4; i++)
        memset(src + i*stride, dc0, 8);
    for(i=4; i<8; i++)
        memset(src + i*stride, dc2, 8);
}
static void pred8x8_top_dc_c(uint8_t *src, int stride){
    int i;
    int dc0=0, dc1=0;

    /* Separate DC per 4-column half, each from its own 4 top neighbours
     * (left column unavailable). All 8 rows get the same two halves. */
    for(i=0; i<4; i++){
        dc0+= src[i  -stride];
        dc1+= src[4+i-stride];
    }
    dc0= (dc0 + 2) >> 2;
    dc1= (dc1 + 2) >> 2;

    for(i=0; i<8; i++){
        memset(src + i*stride,     dc0, 4);
        memset(src + i*stride + 4, dc1, 4);
    }
}
static void pred8x8_dc_c(uint8_t *src, int stride){
    int i;
    int sum0=0, sum1=0, sum2=0;
    int dc0, dc1, dc2, dc3;

    /* One DC per 4x4 quadrant of the 8x8 chroma block:
     * sum0 = top-left 4 top + 4 left, sum1 = top-right 4 top,
     * sum2 = bottom-left 4 left.  The bottom-right quadrant averages the
     * raw top-right and bottom-left sums (as in the original, which
     * computed dc3 before rescaling dc1/dc2). */
    for(i=0; i<4; i++){
        sum0+= src[-1+i*stride] + src[i-stride];
        sum1+= src[4+i-stride];
        sum2+= src[-1+(i+4)*stride];
    }
    dc0= (sum0 + 4) >> 3;
    dc1= (sum1 + 2) >> 2;
    dc2= (sum2 + 2) >> 2;
    dc3= (sum1 + sum2 + 4) >> 3;

    for(i=0; i<4; i++){
        memset(src + i*stride,     dc0, 4);
        memset(src + i*stride + 4, dc1, 4);
    }
    for(i=4; i<8; i++){
        memset(src + i*stride,     dc2, 4);
        memset(src + i*stride + 4, dc3, 4);
    }
}
/* Plane (gradient) 8x8 chroma prediction: fit H/V gradients from the border
 * samples, then fill with a clipped linear ramp through the cropTbl LUT.
 * NOTE(review): declaration lines (k, a, b, i, j) and the per-row b update
 * are missing from this extract; code kept verbatim. */
2594 static void pred8x8_plane_c(uint8_t *src, int stride){
2597 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2598 const uint8_t * const src0 = src+3-stride;
2599 const uint8_t *src1 = src+4*stride-1;
2600 const uint8_t *src2 = src1-2*stride; // == src+2*stride-1;
2601 int H = src0[1] - src0[-1];
2602 int V = src1[0] - src2[ 0];
2603 for(k=2; k<=4; ++k) {
2604 src1 += stride; src2 -= stride;
2605 H += k*(src0[k] - src0[-k]);
2606 V += k*(src1[0] - src2[ 0]);
/* chroma gradient scaling per the spec: (17*x+16)>>5 */
2608 H = ( 17*H+16 ) >> 5;
2609 V = ( 17*V+16 ) >> 5;
2611 a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
2612 for(j=8; j>0; --j) {
2615 src[0] = cm[ (b ) >> 5 ];
2616 src[1] = cm[ (b+ H) >> 5 ];
2617 src[2] = cm[ (b+2*H) >> 5 ];
2618 src[3] = cm[ (b+3*H) >> 5 ];
2619 src[4] = cm[ (b+4*H) >> 5 ];
2620 src[5] = cm[ (b+5*H) >> 5 ];
2621 src[6] = cm[ (b+6*H) >> 5 ];
2622 src[7] = cm[ (b+7*H) >> 5 ];
/* 8x8 luma (High Profile) prediction helpers.  Unlike the 4x4 case, the
 * neighbouring samples are low-pass filtered ((a+2b+c+2)>>2) before use, as
 * the spec requires for 8x8 intra.  has_topleft/has_topright select the
 * fallback sample when a neighbour block is unavailable.
 * NOTE(review): some lines (e.g. the PL/PT/PTR #define headers and blank
 * separators) are missing from this extract; code kept verbatim. */
2627 #define SRC(x,y) src[(x)+(y)*stride]
2629 const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
2630 #define PREDICT_8x8_LOAD_LEFT \
2631 const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
2632 + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
2633 PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
2634 const int l7 attribute_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2
2637 const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
2638 #define PREDICT_8x8_LOAD_TOP \
2639 const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
2640 + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
2641 PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
2642 const int t7 attribute_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
2643 + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2
2646 t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
2647 #define PREDICT_8x8_LOAD_TOPRIGHT \
2648 int t8, t9, t10, t11, t12, t13, t14, t15; \
2649 if(has_topright) { \
2650 PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
2651 t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
2652 } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);
2654 #define PREDICT_8x8_LOAD_TOPLEFT \
2655 const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2
2657 #define PREDICT_8x8_DC(v) \
2659 for( y = 0; y < 8; y++ ) { \
2660 ((uint32_t*)src)[0] = \
2661 ((uint32_t*)src)[1] = v; \
2665 static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2667 PREDICT_8x8_DC(0x80808080);
2669 static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2671 PREDICT_8x8_LOAD_LEFT;
2672 const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101;
2675 static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2677 PREDICT_8x8_LOAD_TOP;
2678 const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101;
2681 static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2683 PREDICT_8x8_LOAD_LEFT;
2684 PREDICT_8x8_LOAD_TOP;
2685 const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7
2686 +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101;
2689 static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2691 PREDICT_8x8_LOAD_LEFT;
2692 #define ROW(y) ((uint32_t*)(src+y*stride))[0] =\
2693 ((uint32_t*)(src+y*stride))[1] = 0x01010101 * l##y
2694 ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
2697 static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2700 PREDICT_8x8_LOAD_TOP;
2709 for( y = 1; y < 8; y++ )
2710 *(uint64_t*)(src+y*stride) = *(uint64_t*)src;
/* Directional 8x8 luma predictors.  Each SRC(x,y)=... line writes one
 * anti-diagonal of equal predicted values, using the filtered neighbours
 * (t0..t15 top, l0..l7 left, lt top-left) set up by the PREDICT_8x8_LOAD_*
 * macros above.  Kept verbatim: the exact assignment order and tap choices
 * follow the H.264 spec and are easy to break. */
2712 static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2714 PREDICT_8x8_LOAD_TOP;
2715 PREDICT_8x8_LOAD_TOPRIGHT;
2716 SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
2717 SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
2718 SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
2719 SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
2720 SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
2721 SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
2722 SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
2723 SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
2724 SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
2725 SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
2726 SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
2727 SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
2728 SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
2729 SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
2730 SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
/* down-right: diagonals run from bottom-left (left samples) through the
 * corner (lt) to top-right (top samples) */
2732 static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2734 PREDICT_8x8_LOAD_TOP;
2735 PREDICT_8x8_LOAD_LEFT;
2736 PREDICT_8x8_LOAD_TOPLEFT;
2737 SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
2738 SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
2739 SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
2740 SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
2741 SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
2742 SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
2743 SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
2744 SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
2745 SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
2746 SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
2747 SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
2748 SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
2749 SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
2750 SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
2751 SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
/* vertical-right: alternates 2-tap averages and 3-tap filtered values */
2754 static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2756 PREDICT_8x8_LOAD_TOP;
2757 PREDICT_8x8_LOAD_LEFT;
2758 PREDICT_8x8_LOAD_TOPLEFT;
2759 SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
2760 SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
2761 SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
2762 SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
2763 SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
2764 SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
2765 SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
2766 SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
2767 SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
2768 SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
2769 SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
2770 SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
2771 SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
2772 SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
2773 SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
2774 SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
2775 SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
2776 SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
2777 SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
2778 SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
2779 SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
2780 SRC(7,0)= (t6 + t7 + 1) >> 1;
/* horizontal-down */
2782 static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2784 PREDICT_8x8_LOAD_TOP;
2785 PREDICT_8x8_LOAD_LEFT;
2786 PREDICT_8x8_LOAD_TOPLEFT;
2787 SRC(0,7)= (l6 + l7 + 1) >> 1;
2788 SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
2789 SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
2790 SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
2791 SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
2792 SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
2793 SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
2794 SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
2795 SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
2796 SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
2797 SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
2798 SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
2799 SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
2800 SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
2801 SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
2802 SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
2803 SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
2804 SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
2805 SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
2806 SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
2807 SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
2808 SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
/* vertical-left: top + top-right neighbours only */
2810 static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2812 PREDICT_8x8_LOAD_TOP;
2813 PREDICT_8x8_LOAD_TOPRIGHT;
2814 SRC(0,0)= (t0 + t1 + 1) >> 1;
2815 SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
2816 SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
2817 SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
2818 SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
2819 SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
2820 SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
2821 SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
2822 SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
2823 SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
2824 SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
2825 SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
2826 SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
2827 SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
2828 SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
2829 SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
2830 SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
2831 SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
2832 SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
2833 SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
2834 SRC(7,6)= (t10 + t11 + 1) >> 1;
2835 SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
/* horizontal-up: left column only; lower-right triangle saturates to l7 */
2837 static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2839 PREDICT_8x8_LOAD_LEFT;
2840 SRC(0,0)= (l0 + l1 + 1) >> 1;
2841 SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
2842 SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
2843 SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
2844 SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
2845 SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
2846 SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
2847 SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
2848 SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
2849 SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
2850 SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
2851 SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
2852 SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
2853 SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
2854 SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
2855 SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
2856 SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
2857 SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
/* scope the helper macros to this section only */
2859 #undef PREDICT_8x8_LOAD_LEFT
2860 #undef PREDICT_8x8_LOAD_TOP
2861 #undef PREDICT_8x8_LOAD_TOPLEFT
2862 #undef PREDICT_8x8_LOAD_TOPRIGHT
2863 #undef PREDICT_8x8_DC
/* Motion compensation for one partition, one reference list.
 * Luma is interpolated with the quarter-pel qpix_op selected by the
 * fractional MV bits (luma_xy); chroma with the eighth-pel chroma_op.
 * When the reference area extends past the picture, the samples are first
 * copied into edge_emu_buffer with edge replication (ff_emulated_edge_mc).
 * NOTE(review): several lines are missing from this extract (e.g. the emu
 * flag computation and some braces); code kept verbatim. */
2869 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
2870 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2871 int src_x_offset, int src_y_offset,
2872 qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
2873 MpegEncContext * const s = &h->s;
2874 const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
2875 int my= h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
2876 const int luma_xy= (mx&3) + ((my&3)<<2);
2877 uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
2878 uint8_t * src_cb, * src_cr;
2879 int extra_width= h->emu_edge_width;
2880 int extra_height= h->emu_edge_height;
2882 const int full_mx= mx>>2;
2883 const int full_my= my>>2;
2884 const int pic_width = 16*s->mb_width;
2885 const int pic_height = 16*s->mb_height >> MB_MBAFF;
/* sub-pel interpolation reads 3 extra samples on the filtered axis */
2890 if(mx&7) extra_width -= 3;
2891 if(my&7) extra_height -= 3;
2893 if( full_mx < 0-extra_width
2894 || full_my < 0-extra_height
2895 || full_mx + 16/*FIXME*/ > pic_width + extra_width
2896 || full_my + 16/*FIXME*/ > pic_height + extra_height){
2897 ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
2898 src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
2902 qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
/* non-square partition: second half at +delta (8 pixels right or below) */
2904 qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
2907 if(s->flags&CODEC_FLAG_GRAY) return;
2910 // chroma offset when predicting from a field of opposite parity
2911 my += 2 * ((s->mb_y & 1) - (h->ref_cache[list][scan8[n]] & 1));
2912 emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
2914 src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
2915 src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
2918 ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
2919 src_cb= s->edge_emu_buffer;
2921 chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
2924 ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
2925 src_cr= s->edge_emu_buffer;
2927 chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
/* Unweighted (standard) prediction for one partition: run the "put" op for
 * the first available list, then switch to the "avg" op so a second list,
 * if present, is averaged into the same destination (bi-prediction).
 * NOTE(review): the if(list0)/if(list1) guard lines and the op switch for
 * qpix are missing from this extract; code kept verbatim. */
2930 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
2931 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2932 int x_offset, int y_offset,
2933 qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2934 qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
2935 int list0, int list1){
2936 MpegEncContext * const s = &h->s;
2937 qpel_mc_func *qpix_op= qpix_put;
2938 h264_chroma_mc_func chroma_op= chroma_put;
/* destination advanced to the partition; offsets then made frame-absolute */
2940 dest_y += 2*x_offset + 2*y_offset*h-> mb_linesize;
2941 dest_cb += x_offset + y_offset*h->mb_uvlinesize;
2942 dest_cr += x_offset + y_offset*h->mb_uvlinesize;
2943 x_offset += 8*s->mb_x;
2944 y_offset += 8*(s->mb_y >> MB_MBAFF);
2947 Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
2948 mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
2949 dest_y, dest_cb, dest_cr, x_offset, y_offset,
2950 qpix_op, chroma_op);
/* second list averages over the first list's result */
2953 chroma_op= chroma_avg;
2957 Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
2958 mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
2959 dest_y, dest_cb, dest_cr, x_offset, y_offset,
2960 qpix_op, chroma_op);
/* Weighted prediction for one partition.  Bi-directional case: predict each
 * list into separate buffers (obmc_scratchpad for list 1) and blend with
 * either implicit weights (use_weight == 2, weight pair summing to 64) or
 * explicit per-ref luma/chroma weights and offsets.  Uni-directional case:
 * predict in place, then apply the single-list weight op.
 * NOTE(review): the if(list0 && list1)/else branch lines are missing from
 * this extract; code kept verbatim. */
2964 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
2965 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2966 int x_offset, int y_offset,
2967 qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2968 h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
2969 h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
2970 int list0, int list1){
2971 MpegEncContext * const s = &h->s;
2973 dest_y += 2*x_offset + 2*y_offset*h-> mb_linesize;
2974 dest_cb += x_offset + y_offset*h->mb_uvlinesize;
2975 dest_cr += x_offset + y_offset*h->mb_uvlinesize;
2976 x_offset += 8*s->mb_x;
2977 y_offset += 8*(s->mb_y >> MB_MBAFF);
2980 /* don't optimize for luma-only case, since B-frames usually
2981 * use implicit weights => chroma too. */
2982 uint8_t *tmp_cb = s->obmc_scratchpad;
2983 uint8_t *tmp_cr = s->obmc_scratchpad + 8;
2984 uint8_t *tmp_y = s->obmc_scratchpad + 8*h->mb_uvlinesize;
2985 int refn0 = h->ref_cache[0][ scan8[n] ];
2986 int refn1 = h->ref_cache[1][ scan8[n] ];
2988 mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
2989 dest_y, dest_cb, dest_cr,
2990 x_offset, y_offset, qpix_put, chroma_put);
2991 mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
2992 tmp_y, tmp_cb, tmp_cr,
2993 x_offset, y_offset, qpix_put, chroma_put);
/* implicit weighting: table-driven weight0, complement weight1, denom 5 */
2995 if(h->use_weight == 2){
2996 int weight0 = h->implicit_weight[refn0][refn1];
2997 int weight1 = 64 - weight0;
2998 luma_weight_avg( dest_y, tmp_y, h-> mb_linesize, 5, weight0, weight1, 0);
2999 chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
3000 chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
/* explicit weighting: per-reference weights and additive offsets */
3002 luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
3003 h->luma_weight[0][refn0], h->luma_weight[1][refn1],
3004 h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
3005 chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
3006 h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
3007 h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
3008 chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
3009 h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
3010 h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
/* uni-directional weighted path */
3013 int list = list1 ? 1 : 0;
3014 int refn = h->ref_cache[list][ scan8[n] ];
3015 Picture *ref= &h->ref_list[list][refn];
3016 mc_dir_part(h, ref, n, square, chroma_height, delta, list,
3017 dest_y, dest_cb, dest_cr, x_offset, y_offset,
3018 qpix_put, chroma_put);
3020 luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
3021 h->luma_weight[list][refn], h->luma_offset[list][refn]);
3022 if(h->use_weight_chroma){
3023 chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
3024 h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
3025 chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
3026 h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
/* Dispatch one partition to the weighted or standard MC path.  The weighted
 * path is taken for explicit weighting (use_weight==1) or for implicit
 * bi-prediction whose weight differs from the 32/32 plain average. */
3031 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
3032 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
3033 int x_offset, int y_offset,
3034 qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
3035 qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
3036 h264_weight_func *weight_op, h264_biweight_func *weight_avg,
3037 int list0, int list1){
3038 if((h->use_weight==2 && list0 && list1
3039 && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
3040 || h->use_weight==1)
3041 mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
3042 x_offset, y_offset, qpix_put, chroma_put,
3043 weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
3045 mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
3046 x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
/* NOTE(review): the guard that skips prefetching for intra/unavailable refs
 * (original line 3054) is missing from this extract; kept verbatim. */
3049 static inline void prefetch_motion(H264Context *h, int list){
3050 /* fetch pixels for estimated mv 4 macroblocks ahead
3051 * optimized for 64byte cache lines */
3052 MpegEncContext * const s = &h->s;
3053 const int refn = h->ref_cache[list][scan8[0]];
3055 const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
3056 const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
3057 uint8_t **src= h->ref_list[list][refn].data;
3058 int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
3059 s->dsp.prefetch(src[0]+off, s->linesize, 4);
/* chroma planes are contiguous, so one prefetch covers cb and cr */
3060 off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
3061 s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
/* Inter prediction for a whole macroblock: split by partition shape
 * (16x16 / 16x8 / 8x16 / 8x8 with sub-partitions) and call mc_part for
 * each, selecting the qpix/chroma op and weight-op width matching the
 * partition size.  Partition index n maps into the scan8[] MV cache.
 * NOTE(review): loop headers (for(i...), for(j...)) and some braces are
 * missing from this extract; code kept verbatim. */
3065 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
3066 qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
3067 qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
3068 h264_weight_func *weight_op, h264_biweight_func *weight_avg){
3069 MpegEncContext * const s = &h->s;
3070 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
3071 const int mb_type= s->current_picture.mb_type[mb_xy];
3073 assert(IS_INTER(mb_type));
3075 prefetch_motion(h, 0);
3077 if(IS_16X16(mb_type)){
3078 mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
3079 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
3080 &weight_op[0], &weight_avg[0],
3081 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
3082 }else if(IS_16X8(mb_type)){
3083 mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
3084 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
3085 &weight_op[1], &weight_avg[1],
3086 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
3087 mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
3088 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
3089 &weight_op[1], &weight_avg[1],
3090 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
3091 }else if(IS_8X16(mb_type)){
3092 mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
3093 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
3094 &weight_op[2], &weight_avg[2],
3095 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
3096 mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
3097 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
3098 &weight_op[2], &weight_avg[2],
3099 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
/* 8x8 path: each quarter has its own sub_mb_type */
3103 assert(IS_8X8(mb_type));
3106 const int sub_mb_type= h->sub_mb_type[i];
3108 int x_offset= (i&1)<<2;
3109 int y_offset= (i&2)<<1;
3111 if(IS_SUB_8X8(sub_mb_type)){
3112 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
3113 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
3114 &weight_op[3], &weight_avg[3],
3115 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3116 }else if(IS_SUB_8X4(sub_mb_type)){
3117 mc_part(h, n , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
3118 qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
3119 &weight_op[4], &weight_avg[4],
3120 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3121 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
3122 qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
3123 &weight_op[4], &weight_avg[4],
3124 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3125 }else if(IS_SUB_4X8(sub_mb_type)){
3126 mc_part(h, n , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
3127 qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
3128 &weight_op[5], &weight_avg[5],
3129 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3130 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
3131 qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
3132 &weight_op[5], &weight_avg[5],
3133 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3136 assert(IS_SUB_4X4(sub_mb_type));
3138 int sub_x_offset= x_offset + 2*(j&1);
3139 int sub_y_offset= y_offset + (j&2);
3140 mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
3141 qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
3142 &weight_op[6], &weight_avg[6],
3143 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3149 prefetch_motion(h, 1);
// Builds the static CAVLC lookup tables (coeff_token, total_zeros, run)
// used by the residual decoder.  NOTE(review): lines are elided in this
// listing; the 'done' flag presumably guards one-time initialization and
// the init_vlc() calls sit inside loops whose headers are not visible —
// confirm against the full file.
3152 static void decode_init_vlc(H264Context *h){
3153 static int done = 0;
// coeff_token table for the chroma DC block (4*5 entries).
3159 init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
3160 &chroma_dc_coeff_token_len [0], 1, 1,
3161 &chroma_dc_coeff_token_bits[0], 1, 1, 1);
// Luma coeff_token tables, one per nC context (4*17 entries each).
3164 init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
3165 &coeff_token_len [i][0], 1, 1,
3166 &coeff_token_bits[i][0], 1, 1, 1);
3170 init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
3171 &chroma_dc_total_zeros_len [i][0], 1, 1,
3172 &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
// total_zeros tables for 1..15 remaining coefficients.
3174 for(i=0; i<15; i++){
3175 init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
3176 &total_zeros_len [i][0], 1, 1,
3177 &total_zeros_bits[i][0], 1, 1, 1);
// run_before tables; runs >= 7 share the dedicated run7 table below.
3181 init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
3182 &run_len [i][0], 1, 1,
3183 &run_bits[i][0], 1, 1, 1);
3185 init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
3186 &run_len [6][0], 1, 1,
3187 &run_bits[6][0], 1, 1, 1);
3192 * Sets the intra prediction function pointers.
// Fills the intra-prediction function-pointer tables with the C reference
// implementations: pred4x4 (9 modes + DC fallbacks), pred8x8l (8x8 luma,
// transform_8x8 mode), pred8x8 (chroma), pred16x16 (Intra_16x16 luma).
// The *_DC_PRED / DC_128 entries are the edge-case DC variants used when
// left/top neighbours are unavailable.
3194 static void init_pred_ptrs(H264Context *h){
3195 // MpegEncContext * const s = &h->s;
3197 h->pred4x4[VERT_PRED ]= pred4x4_vertical_c;
3198 h->pred4x4[HOR_PRED ]= pred4x4_horizontal_c;
3199 h->pred4x4[DC_PRED ]= pred4x4_dc_c;
3200 h->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_c;
3201 h->pred4x4[DIAG_DOWN_RIGHT_PRED]= pred4x4_down_right_c;
3202 h->pred4x4[VERT_RIGHT_PRED ]= pred4x4_vertical_right_c;
3203 h->pred4x4[HOR_DOWN_PRED ]= pred4x4_horizontal_down_c;
3204 h->pred4x4[VERT_LEFT_PRED ]= pred4x4_vertical_left_c;
3205 h->pred4x4[HOR_UP_PRED ]= pred4x4_horizontal_up_c;
3206 h->pred4x4[LEFT_DC_PRED ]= pred4x4_left_dc_c;
3207 h->pred4x4[TOP_DC_PRED ]= pred4x4_top_dc_c;
3208 h->pred4x4[DC_128_PRED ]= pred4x4_128_dc_c;
3210 h->pred8x8l[VERT_PRED ]= pred8x8l_vertical_c;
3211 h->pred8x8l[HOR_PRED ]= pred8x8l_horizontal_c;
3212 h->pred8x8l[DC_PRED ]= pred8x8l_dc_c;
3213 h->pred8x8l[DIAG_DOWN_LEFT_PRED ]= pred8x8l_down_left_c;
3214 h->pred8x8l[DIAG_DOWN_RIGHT_PRED]= pred8x8l_down_right_c;
3215 h->pred8x8l[VERT_RIGHT_PRED ]= pred8x8l_vertical_right_c;
3216 h->pred8x8l[HOR_DOWN_PRED ]= pred8x8l_horizontal_down_c;
3217 h->pred8x8l[VERT_LEFT_PRED ]= pred8x8l_vertical_left_c;
3218 h->pred8x8l[HOR_UP_PRED ]= pred8x8l_horizontal_up_c;
3219 h->pred8x8l[LEFT_DC_PRED ]= pred8x8l_left_dc_c;
3220 h->pred8x8l[TOP_DC_PRED ]= pred8x8l_top_dc_c;
3221 h->pred8x8l[DC_128_PRED ]= pred8x8l_128_dc_c;
3223 h->pred8x8[DC_PRED8x8 ]= pred8x8_dc_c;
3224 h->pred8x8[VERT_PRED8x8 ]= pred8x8_vertical_c;
3225 h->pred8x8[HOR_PRED8x8 ]= pred8x8_horizontal_c;
3226 h->pred8x8[PLANE_PRED8x8 ]= pred8x8_plane_c;
3227 h->pred8x8[LEFT_DC_PRED8x8]= pred8x8_left_dc_c;
3228 h->pred8x8[TOP_DC_PRED8x8 ]= pred8x8_top_dc_c;
3229 h->pred8x8[DC_128_PRED8x8 ]= pred8x8_128_dc_c;
// 16x16 prediction reuses the 8x8 (chroma) mode indices.
3231 h->pred16x16[DC_PRED8x8 ]= pred16x16_dc_c;
3232 h->pred16x16[VERT_PRED8x8 ]= pred16x16_vertical_c;
3233 h->pred16x16[HOR_PRED8x8 ]= pred16x16_horizontal_c;
3234 h->pred16x16[PLANE_PRED8x8 ]= pred16x16_plane_c;
3235 h->pred16x16[LEFT_DC_PRED8x8]= pred16x16_left_dc_c;
3236 h->pred16x16[TOP_DC_PRED8x8 ]= pred16x16_top_dc_c;
3237 h->pred16x16[DC_128_PRED8x8 ]= pred16x16_128_dc_c;
// Releases every per-stream table allocated by alloc_tables(); the inverse
// of that function.  av_freep() NULLs the pointers, so calling this twice
// is safe.  slice_table is only an offset view into slice_table_base, hence
// it is cleared rather than freed.
3240 static void free_tables(H264Context *h){
3241 av_freep(&h->intra4x4_pred_mode);
3242 av_freep(&h->chroma_pred_mode_table);
3243 av_freep(&h->cbp_table);
3244 av_freep(&h->mvd_table[0]);
3245 av_freep(&h->mvd_table[1]);
3246 av_freep(&h->direct_table);
3247 av_freep(&h->non_zero_count);
3248 av_freep(&h->slice_table_base);
3249 av_freep(&h->top_borders[1]);
3250 av_freep(&h->top_borders[0]);
3251 h->slice_table= NULL; // alias into slice_table_base, freed above
3253 av_freep(&h->mb2b_xy);
3254 av_freep(&h->mb2b8_xy);
3256 av_freep(&h->s.obmc_scratchpad);
// Precomputes the 8x8 dequantization tables for all 52 QP values from the
// PPS scaling matrices.  If both intra and inter matrices are identical the
// second table aliases the first to save memory.  The coefficients are
// stored transposed when a non-reference (SIMD) idct8 is in use, to match
// its expected layout.  NOTE(review): the loop headers over x and the idx
// computation are elided in this listing.
3259 static void init_dequant8_coeff_table(H264Context *h){
3261 const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
3262 h->dequant8_coeff[0] = h->dequant8_buffer[0];
3263 h->dequant8_coeff[1] = h->dequant8_buffer[1];
3265 for(i=0; i<2; i++ ){
// Share table 0 when intra and inter 8x8 scaling matrices match.
3266 if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
3267 h->dequant8_coeff[1] = h->dequant8_buffer[0];
3271 for(q=0; q<52; q++){
3272 int shift = div6[q];
3275 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
3276 ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
3277 h->pps.scaling_matrix8[i][x]) << shift;
// Precomputes the 4x4 dequantization tables (6 matrices: intra/inter Y, Cb,
// Cr) for all 52 QPs from the PPS scaling matrices, aliasing duplicates.
// Entries are stored transposed when a non-reference idct is in use.
// NOTE(review): the inner loops over j/x and the idx computation are elided
// in this listing.
3282 static void init_dequant4_coeff_table(H264Context *h){
3284 const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
3285 for(i=0; i<6; i++ ){
3286 h->dequant4_coeff[i] = h->dequant4_buffer[i];
// Alias an earlier buffer when the scaling matrices are identical.
3288 if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
3289 h->dequant4_coeff[i] = h->dequant4_buffer[j];
3296 for(q=0; q<52; q++){
3297 int shift = div6[q] + 2;
3300 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
3301 ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
3302 h->pps.scaling_matrix4[i][x]) << shift;
// Builds the 4x4 (and, when the PPS enables 8x8 transforms, 8x8) dequant
// tables, then overwrites the QP==0 rows with the identity scale 1<<6 for
// lossless (transform-bypass) streams.  NOTE(review): the loop headers over
// i/x are elided in this listing.
3307 static void init_dequant_tables(H264Context *h){
3309 init_dequant4_coeff_table(h);
3310 if(h->pps.transform_8x8_mode)
3311 init_dequant8_coeff_table(h);
3312 if(h->sps.transform_bypass){
3315 h->dequant4_coeff[i][0][x] = 1<<6; // identity for lossless QP 0
3316 if(h->pps.transform_8x8_mode)
3319 h->dequant8_coeff[i][0][x] = 1<<6;
3326 * needs width/height
// Allocates all per-stream tables sized from mb_width/mb_height (so the
// dimensions must already be known).  big_mb_num includes one extra MB row
// of padding so neighbour lookups above row 0 stay in bounds.
// NOTE(review): CHECKED_ALLOCZ presumably jumps to an error path (elided
// from this listing) that frees partial allocations — confirm in full file.
3328 static int alloc_tables(H264Context *h){
3329 MpegEncContext * const s = &h->s;
3330 const int big_mb_num= s->mb_stride * (s->mb_height+1); // +1 row padding
3333 CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8 * sizeof(uint8_t))
3335 CHECKED_ALLOCZ(h->non_zero_count , big_mb_num * 16 * sizeof(uint8_t))
3336 CHECKED_ALLOCZ(h->slice_table_base , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
3337 CHECKED_ALLOCZ(h->top_borders[0] , s->mb_width * (16+8+8) * sizeof(uint8_t))
3338 CHECKED_ALLOCZ(h->top_borders[1] , s->mb_width * (16+8+8) * sizeof(uint8_t))
3339 CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
// CABAC-only side tables.
3341 if( h->pps.cabac ) {
3342 CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
3343 CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
3344 CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
3345 CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
// -1 marks "no slice"; slice_table is offset past the padding row.
3348 memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride) * sizeof(uint8_t));
3349 h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
// MB index -> motion-vector (b) / reference (b8) index maps.
3351 CHECKED_ALLOCZ(h->mb2b_xy , big_mb_num * sizeof(uint32_t));
3352 CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
3353 for(y=0; y<s->mb_height; y++){
3354 for(x=0; x<s->mb_width; x++){
3355 const int mb_xy= x + y*s->mb_stride;
3356 const int b_xy = 4*x + 4*y*h->b_stride;
3357 const int b8_xy= 2*x + 2*y*h->b8_stride;
3359 h->mb2b_xy [mb_xy]= b_xy;
3360 h->mb2b8_xy[mb_xy]= b8_xy;
// Allocated lazily in frame_start() once linesize is known.
3364 s->obmc_scratchpad = NULL;
3366 if(!h->dequant4_coeff[0])
3367 init_dequant_tables(h);
// Initialization shared by decoder (and encoder) setup: copies dimensions
// from the AVCodecContext and seeds the scaling matrices with the spec's
// Flat_4x4_16 / Flat_8x8_16 default (all 16s), used until a SPS/PPS
// provides real ones.
3375 static void common_init(H264Context *h){
3376 MpegEncContext * const s = &h->s;
3378 s->width = s->avctx->width;
3379 s->height = s->avctx->height;
3380 s->codec_id= s->avctx->codec->id;
3384 h->dequant_coeff_pps= -1; // no PPS has filled the dequant tables yet
3385 s->unrestricted_mv=1;
3386 s->decode=1; //FIXME
// Default "flat" scaling lists (value 16 == unity scaling).
3388 memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
3389 memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
// AVCodec init callback: sets up the MpegEncContext for H.264 decoding and
// detects AVC-style (avcC) extradata by its leading version byte 1, as
// opposed to Annex-B start-code extradata.  NOTE(review): the tail of this
// function (avcC handling, return) is elided from this listing.
3392 static int decode_init(AVCodecContext *avctx){
3393 H264Context *h= avctx->priv_data;
3394 MpegEncContext * const s = &h->s;
3396 MPV_decode_defaults(s);
3401 s->out_format = FMT_H264;
3402 s->workaround_bugs= avctx->workaround_bugs;
3405 // s->decode_mb= ff_h263_decode_mb;
3407 avctx->pix_fmt= PIX_FMT_YUV420P;
// avcC extradata starts with configurationVersion == 1.
3411 if(avctx->extradata_size > 0 && avctx->extradata &&
3412 *(char *)avctx->extradata == 1){
// Per-frame setup: starts the MPV frame and error resilience, precomputes
// the scan8-ordered block->pixel offsets (entries 0..23 for frame/progressive
// strides, 24..47 for the doubled MBAFF field strides), lazily allocates the
// bipred scratchpad, and resets slice_table.
3422 static int frame_start(H264Context *h){
3423 MpegEncContext * const s = &h->s;
3426 if(MPV_frame_start(s, s->avctx) < 0)
3428 ff_er_frame_start(s);
3430 assert(s->linesize && s->uvlinesize);
// Luma offsets: [i] frame stride, [24+i] field (doubled) stride.
3432 for(i=0; i<16; i++){
3433 h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
3434 h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
// Chroma offsets (Cb and Cr share the same layout).
3437 h->block_offset[16+i]=
3438 h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
3439 h->block_offset[24+16+i]=
3440 h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
3443 /* can't be in alloc_tables because linesize isn't known there.
3444 * FIXME: redo bipred weight to not require extra buffer? */
3445 if(!s->obmc_scratchpad)
3446 s->obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
3448 /* some macroblocks will be accessed before they're available */
3450 memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
3452 // s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
// Saves the right column (left_border for the next MB) and the bottom row
// (top_borders for the next MB row) of the just-decoded macroblock, so the
// deblocking filter can still read unfiltered neighbour samples.
3456 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
3457 MpegEncContext * const s = &h->s;
3461 src_cb -= uvlinesize;
3462 src_cr -= uvlinesize;
3464 // There are two lines saved, the line above the top macroblock of a pair,
3465 // and the line above the bottom macroblock
// Carry over the top-left corner sample, then the 16 right-column samples.
3466 h->left_border[0]= h->top_borders[0][s->mb_x][15];
3467 for(i=1; i<17; i++){
3468 h->left_border[i]= src_y[15+i* linesize];
// Bottom luma row, copied as two 8-byte words.
3471 *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y + 16*linesize);
3472 *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
// Chroma borders are skipped entirely in grayscale-only mode.
3474 if(!(s->flags&CODEC_FLAG_GRAY)){
3475 h->left_border[17 ]= h->top_borders[0][s->mb_x][16+7];
3476 h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
3478 h->left_border[i+17 ]= src_cb[7+i*uvlinesize];
3479 h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
3481 *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
3482 *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
// Swaps (xchg=1) or restores (xchg=0) the saved unfiltered border samples
// with the current MB's edge pixels, so intra prediction sees unfiltered
// neighbours while the deblocked picture is preserved.  NOTE(review): the
// XCHG macro body is elided in this listing; the third-arg constant 1
// appears to force the copy direction for the save-only positions —
// confirm against the full macro definition.
3486 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
3487 MpegEncContext * const s = &h->s;
3490 int deblock_left = (s->mb_x > 0);
3491 int deblock_top = (s->mb_y > 0);
// Step back to the row/column above-left of the MB.
3493 src_y -= linesize + 1;
3494 src_cb -= uvlinesize + 1;
3495 src_cr -= uvlinesize + 1;
3497 #define XCHG(a,b,t,xchg)\
// Left luma column (skip row 0 when there is no top neighbour).
3504 for(i = !deblock_top; i<17; i++){
3505 XCHG(h->left_border[i ], src_y [i* linesize], temp8, xchg);
// Top luma row; the second half and the top-right word are always stored.
3510 XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
3511 XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
3512 if(s->mb_x+1 < s->mb_width){
3513 XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
3517 if(!(s->flags&CODEC_FLAG_GRAY)){
3519 for(i = !deblock_top; i<9; i++){
3520 XCHG(h->left_border[i+17 ], src_cb[i*uvlinesize], temp8, xchg);
3521 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
3525 XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
3526 XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
// MBAFF variant of backup_mb_border: saves borders for a whole macroblock
// PAIR (two stacked MBs), hence two top lines (top_borders[0] and [1]) and
// 32 luma / 2x16 chroma left-column samples.
3531 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
3532 MpegEncContext * const s = &h->s;
3535 src_y -= 2 * linesize;
3536 src_cb -= 2 * uvlinesize;
3537 src_cr -= 2 * uvlinesize;
3539 // There are two lines saved, the line above the top macroblock of a pair,
3540 // and the line above the bottom macroblock
3541 h->left_border[0]= h->top_borders[0][s->mb_x][15];
3542 h->left_border[1]= h->top_borders[1][s->mb_x][15];
3543 for(i=2; i<34; i++){
3544 h->left_border[i]= src_y[15+i* linesize];
// Bottom two luma rows of the pair (rows 32 and 33).
3547 *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y + 32*linesize);
3548 *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
3549 *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y + 33*linesize);
3550 *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
3552 if(!(s->flags&CODEC_FLAG_GRAY)){
3553 h->left_border[34 ]= h->top_borders[0][s->mb_x][16+7];
3554 h->left_border[34+ 1]= h->top_borders[1][s->mb_x][16+7];
3555 h->left_border[34+18 ]= h->top_borders[0][s->mb_x][24+7];
3556 h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
3557 for(i=2; i<18; i++){
3558 h->left_border[i+34 ]= src_cb[7+i*uvlinesize];
3559 h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
3561 *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
3562 *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
3563 *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
3564 *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
// MBAFF variant of xchg_mb_border: swaps/restores the saved unfiltered
// borders for a macroblock pair (two top lines, 32+2x16 left samples).
// deblock_top requires mb_y > 1 because a pair occupies two MB rows.
// NOTE(review): the XCHG macro body is elided in this listing.
3568 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
3569 MpegEncContext * const s = &h->s;
3572 int deblock_left = (s->mb_x > 0);
3573 int deblock_top = (s->mb_y > 1);
3575 tprintf("xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
3577 src_y -= 2 * linesize + 1;
3578 src_cb -= 2 * uvlinesize + 1;
3579 src_cr -= 2 * uvlinesize + 1;
3581 #define XCHG(a,b,t,xchg)\
// Skip the first two rows when there is no top pair.
3588 for(i = (!deblock_top)<<1; i<34; i++){
3589 XCHG(h->left_border[i ], src_y [i* linesize], temp8, xchg);
3594 XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
3595 XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
3596 XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
3597 XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
3598 if(s->mb_x+1 < s->mb_width){
3599 XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
3600 XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
3604 if(!(s->flags&CODEC_FLAG_GRAY)){
3606 for(i = (!deblock_top) << 1; i<18; i++){
3607 XCHG(h->left_border[i+34 ], src_cb[i*uvlinesize], temp8, xchg);
3608 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
3612 XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
3613 XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
3614 XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
3615 XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
// High-level per-macroblock reconstruction: computes destination pointers,
// selects idct/idct_dc function pointers, runs intra prediction or motion
// compensation, adds the residual, and finally deblocks.  Also contains the
// MBAFF-specific path (doubled strides, ref_cache field remapping, pair
// border exchange).  NOTE(review): several closing braces and a few
// conditionals are elided from this listing; comments on elided control
// flow are hedged accordingly.
3620 static void hl_decode_mb(H264Context *h){
3621 MpegEncContext * const s = &h->s;
3622 const int mb_x= s->mb_x;
3623 const int mb_y= s->mb_y;
3624 const int mb_xy= mb_x + mb_y*s->mb_stride;
3625 const int mb_type= s->current_picture.mb_type[mb_xy];
3626 uint8_t *dest_y, *dest_cb, *dest_cr;
3627 int linesize, uvlinesize /*dct_offset*/;
3629 int *block_offset = &h->block_offset[0];
3630 const unsigned int bottom = mb_y & 1; // bottom MB of an MBAFF pair
3631 const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass);
3632 void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
3633 void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
3638 dest_y = s->current_picture.data[0] + (mb_y * 16* s->linesize ) + mb_x * 16;
3639 dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3640 dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
// Field decoding: double the strides and use the field block offsets.
3643 linesize = h->mb_linesize = s->linesize * 2;
3644 uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
3645 block_offset = &h->block_offset[24];
3646 if(mb_y&1){ //FIXME move out of this func?
3647 dest_y -= s->linesize*15;
3648 dest_cb-= s->uvlinesize*7;
3649 dest_cr-= s->uvlinesize*7;
// Remap cached refs to field parity (16+ref^(mb_y&1)).
3653 for(list=0; list<2; list++){
3654 if(!USES_LIST(mb_type, list))
3656 if(IS_16X16(mb_type)){
3657 int8_t *ref = &h->ref_cache[list][scan8[0]];
3658 fill_rectangle(ref, 4, 4, 8, 16+*ref^(s->mb_y&1), 1);
3660 for(i=0; i<16; i+=4){
3661 //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
3662 int ref = h->ref_cache[list][scan8[i]];
3664 fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, 16+ref^(s->mb_y&1), 1);
// Non-MBAFF (or frame-coded) path: normal strides.
3670 linesize = h->mb_linesize = s->linesize;
3671 uvlinesize = h->mb_uvlinesize = s->uvlinesize;
3672 // dct_offset = s->linesize * 16;
// Select the residual-add functions for this MB's transform type.
3675 if(transform_bypass){
3677 idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
3678 }else if(IS_8x8DCT(mb_type)){
3679 idct_dc_add = s->dsp.h264_idct8_dc_add;
3680 idct_add = s->dsp.h264_idct8_add;
3682 idct_dc_add = s->dsp.h264_idct_dc_add;
3683 idct_add = s->dsp.h264_idct_add;
// MBAFF + deblocking + intra: temporarily restore unfiltered pair borders.
3686 if(FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
3687 && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
3688 int mbt_y = mb_y&~1;
3689 uint8_t *top_y = s->current_picture.data[0] + (mbt_y * 16* s->linesize ) + mb_x * 16;
3690 uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
3691 uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
3692 xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
// I_PCM macroblock: raw samples, copied straight from h->mb.
3695 if (IS_INTRA_PCM(mb_type)) {
3698 // The pixels are stored in h->mb array in the same order as levels,
3699 // copy them in output in the correct order.
3700 for(i=0; i<16; i++) {
3701 for (y=0; y<4; y++) {
3702 for (x=0; x<4; x++) {
3703 *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
3707 for(i=16; i<16+4; i++) {
3708 for (y=0; y<4; y++) {
3709 for (x=0; x<4; x++) {
3710 *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
3714 for(i=20; i<20+4; i++) {
3715 for (y=0; y<4; y++) {
3716 for (x=0; x<4; x++) {
3717 *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
// Intra macroblock: predict, then (below) add the residual.
3722 if(IS_INTRA(mb_type)){
3723 if(h->deblocking_filter && !FRAME_MBAFF)
3724 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1);
3726 if(!(s->flags&CODEC_FLAG_GRAY)){
3727 h->pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
3728 h->pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
3731 if(IS_INTRA4x4(mb_type)){
// 8x8 transform: four 8x8 luma predictions + idct8 residual.
3733 if(IS_8x8DCT(mb_type)){
3734 for(i=0; i<16; i+=4){
3735 uint8_t * const ptr= dest_y + block_offset[i];
3736 const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
3737 const int nnz = h->non_zero_count_cache[ scan8[i] ];
3738 h->pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
3739 (h->topright_samples_available<<(i+1))&0x8000, linesize);
3741 if(nnz == 1 && h->mb[i*16])
3742 idct_dc_add(ptr, h->mb + i*16, linesize); // DC-only fast path
3744 idct_add(ptr, h->mb + i*16, linesize);
// 4x4 transform: sixteen 4x4 predictions + residual.
3748 for(i=0; i<16; i++){
3749 uint8_t * const ptr= dest_y + block_offset[i];
3751 const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
// These modes read top-right samples; synthesize them by replicating
// the rightmost top sample when the true top-right is unavailable.
3754 if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
3755 const int topright_avail= (h->topright_samples_available<<i)&0x8000;
3756 assert(mb_y || linesize <= block_offset[i]);
3757 if(!topright_avail){
3758 tr= ptr[3 - linesize]*0x01010101;
3759 topright= (uint8_t*) &tr;
3761 topright= ptr + 4 - linesize;
3765 h->pred4x4[ dir ](ptr, topright, linesize);
3766 nnz = h->non_zero_count_cache[ scan8[i] ];
3768 if(s->codec_id == CODEC_ID_H264){
3769 if(nnz == 1 && h->mb[i*16])
3770 idct_dc_add(ptr, h->mb + i*16, linesize);
3772 idct_add(ptr, h->mb + i*16, linesize);
// This decoder is shared with SVQ3, which has its own idct.
3774 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
// Intra_16x16: one full-MB prediction + separate luma DC transform.
3779 h->pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
3780 if(s->codec_id == CODEC_ID_H264){
3781 if(!transform_bypass)
3782 h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[IS_INTRA(mb_type) ? 0:3][s->qscale][0]);
3784 svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
3786 if(h->deblocking_filter && !FRAME_MBAFF)
3787 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0);
// Inter macroblock: motion compensation.
3788 }else if(s->codec_id == CODEC_ID_H264){
3789 hl_motion(h, dest_y, dest_cb, dest_cr,
3790 s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
3791 s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
3792 s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
// Add the luma residual (Intra4x4 already added it above).
3796 if(!IS_INTRA4x4(mb_type)){
3797 if(s->codec_id == CODEC_ID_H264){
3798 if(IS_INTRA16x16(mb_type)){
3799 for(i=0; i<16; i++){
3800 if(h->non_zero_count_cache[ scan8[i] ])
3801 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3802 else if(h->mb[i*16])
3803 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3806 const int di = IS_8x8DCT(mb_type) ? 4 : 1;
3807 for(i=0; i<16; i+=di){
3808 int nnz = h->non_zero_count_cache[ scan8[i] ];
3810 if(nnz==1 && h->mb[i*16])
3811 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3813 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3818 for(i=0; i<16; i++){
3819 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
3820 uint8_t * const ptr= dest_y + block_offset[i];
3821 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
// Chroma residual: dequant the 2x2 DC blocks, then add per 4x4 block.
3827 if(!(s->flags&CODEC_FLAG_GRAY)){
3828 uint8_t *dest[2] = {dest_cb, dest_cr};
3829 if(transform_bypass){
3830 idct_add = idct_dc_add = s->dsp.add_pixels4;
3832 idct_add = s->dsp.h264_idct_add;
3833 idct_dc_add = s->dsp.h264_idct_dc_add;
3834 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp][0]);
3835 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp][0]);
3837 if(s->codec_id == CODEC_ID_H264){
3838 for(i=16; i<16+8; i++){
3839 if(h->non_zero_count_cache[ scan8[i] ])
3840 idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
3841 else if(h->mb[i*16])
3842 idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
3845 for(i=16; i<16+8; i++){
3846 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
3847 uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
3848 svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
// Deblocking.  In MBAFF mode the pair is filtered once, after its bottom
// MB has been reconstructed (hence the early return when !bottom).
3854 if(h->deblocking_filter) {
3856 //FIXME try deblocking one mb at a time?
3857 // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
3858 const int mb_y = s->mb_y - 1; // shadows outer mb_y: top MB of the pair
3859 uint8_t *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
3860 const int mb_xy= mb_x + mb_y*s->mb_stride;
3861 const int mb_type_top = s->current_picture.mb_type[mb_xy];
3862 const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
3863 if (!bottom) return;
3864 pair_dest_y = s->current_picture.data[0] + (mb_y * 16* s->linesize ) + mb_x * 16;
3865 pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3866 pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3868 if(IS_INTRA(mb_type_top | mb_type_bottom))
3869 xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
3871 backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
// Filter the top, then the bottom MB of the pair.
3875 tprintf("call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
3876 fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
3877 h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy]);
3878 filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
3881 tprintf("call mbaff filter_mb\n");
3882 fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
3883 h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
3884 filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
// Non-MBAFF path: back up borders and use the fast filter.
3886 tprintf("call filter_mb\n");
3887 backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3888 fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
3889 filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3895 * fills the default_ref_list.
// Builds the default reference picture lists per H.264 8.2.4.2.
// P slices: short-term refs in decoding order, then long-term by index.
// B slices: short-term refs sorted by POC relative to the current picture
// (L0: past then future, L1: future then past), then long-term; L1's first
// two entries are swapped if it would otherwise equal L0.
3897 static int fill_default_ref_list(H264Context *h){
3898 MpegEncContext * const s = &h->s;
3900 int smallest_poc_greater_than_current = -1;
3901 Picture sorted_short_ref[32];
3903 if(h->slice_type==B_TYPE){
3907 /* sort frame according to poc in B slice */
// Selection sort by ascending POC; remembers where "future" pictures start.
3908 for(out_i=0; out_i<h->short_ref_count; out_i++){
3910 int best_poc=INT_MAX;
3912 for(i=0; i<h->short_ref_count; i++){
3913 const int poc= h->short_ref[i]->poc;
3914 if(poc > limit && poc < best_poc){
3920 assert(best_i != INT_MIN);
3923 sorted_short_ref[out_i]= *h->short_ref[best_i];
3924 tprintf("sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
3925 if (-1 == smallest_poc_greater_than_current) {
3926 if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
3927 smallest_poc_greater_than_current = out_i;
3933 if(s->picture_structure == PICT_FRAME){
3934 if(h->slice_type==B_TYPE){
3936 tprintf("current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
3938 // find the largest poc
// Walk the sorted array outward from the current POC, in opposite
// directions for L0 and L1; wrap to the other side when falling off.
3939 for(list=0; list<2; list++){
3942 int step= list ? -1 : 1;
3944 for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
3945 while(j<0 || j>= h->short_ref_count){
3946 if(j != -99 && step == (list ? -1 : 1))
3949 j= smallest_poc_greater_than_current + (step>>1);
3951 if(sorted_short_ref[j].reference != 3) continue;
3952 h->default_ref_list[list][index ]= sorted_short_ref[j];
3953 h->default_ref_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
// Append long-term refs in ascending index order.
3956 for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
3957 if(h->long_ref[i] == NULL) continue;
3958 if(h->long_ref[i]->reference != 3) continue;
3960 h->default_ref_list[ list ][index ]= *h->long_ref[i];
3961 h->default_ref_list[ list ][index++].pic_id= i;; // NOTE(review): stray ';'
3964 if(list && (smallest_poc_greater_than_current<=0 || smallest_poc_greater_than_current>=h->short_ref_count) && (1 < index)){
3965 // swap the two first elements of L1 when
3966 // L0 and L1 are identical
3967 Picture temp= h->default_ref_list[1][0];
3968 h->default_ref_list[1][0] = h->default_ref_list[1][1];
3969 h->default_ref_list[1][1] = temp;
3972 if(index < h->ref_count[ list ])
3973 memset(&h->default_ref_list[list][index], 0, sizeof(Picture)*(h->ref_count[ list ] - index));
// P-slice (non-B) path: short-term refs first, then long-term.
3977 for(i=0; i<h->short_ref_count; i++){
3978 if(h->short_ref[i]->reference != 3) continue; //FIXME reference-field handling
3979 h->default_ref_list[0][index ]= *h->short_ref[i];
3980 h->default_ref_list[0][index++].pic_id= h->short_ref[i]->frame_num;
3982 for(i = 0; i < 16; i++){
3983 if(h->long_ref[i] == NULL) continue;
3984 if(h->long_ref[i]->reference != 3) continue;
3985 h->default_ref_list[0][index ]= *h->long_ref[i];
3986 h->default_ref_list[0][index++].pic_id= i;; // NOTE(review): stray ';'
3988 if(index < h->ref_count[0])
3989 memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
3992 if(h->slice_type==B_TYPE){
3994 //FIXME second field handling
3998 for (i=0; i<h->ref_count[0]; i++) {
3999 tprintf("List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
4001 if(h->slice_type==B_TYPE){
4002 for (i=0; i<h->ref_count[1]; i++) {
4003 tprintf("List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[0][i].data[0]);
4010 static void print_short_term(H264Context *h);
4011 static void print_long_term(H264Context *h);
// Parses ref_pic_list_reordering() from the slice header (H.264 7.3.3.1 /
// 8.2.4.3): starts from the default lists and, per reordering command,
// moves the named short-term (idc 0/1, by frame_num delta) or long-term
// (idc 2, by index) picture to the front of the remaining list.  Missing
// entries are padded with the current picture.  Returns 0 on success,
// negative on a syntax/overflow error.
4013 static int decode_ref_pic_list_reordering(H264Context *h){
4014 MpegEncContext * const s = &h->s;
4017 print_short_term(h);
4019 if(h->slice_type==I_TYPE || h->slice_type==SI_TYPE) return 0; //FIXME move before func
4021 for(list=0; list<2; list++){
4022 memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
// ref_pic_list_reordering_flag_l0/l1
4024 if(get_bits1(&s->gb)){
4025 int pred= h->curr_pic_num;
4027 for(index=0; ; index++){
4028 int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
4031 Picture *ref = NULL;
4033 if(reordering_of_pic_nums_idc==3) // end of reordering commands
4036 if(index >= h->ref_count[list]){
4037 av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
4041 if(reordering_of_pic_nums_idc<3){
4042 if(reordering_of_pic_nums_idc<2){
// Short-term: idc 0 subtracts, idc 1 adds abs_diff_pic_num
// to the running prediction, modulo max_pic_num.
4043 const int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
4045 if(abs_diff_pic_num >= h->max_pic_num){
4046 av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
4050 if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
4051 else pred+= abs_diff_pic_num;
4052 pred &= h->max_pic_num - 1;
4054 for(i= h->short_ref_count-1; i>=0; i--){
4055 ref = h->short_ref[i];
4056 assert(ref->reference == 3);
4057 assert(!ref->long_ref);
4058 if(ref->data[0] != NULL && ref->frame_num == pred && ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer
4062 ref->pic_id= ref->frame_num;
// Long-term: looked up directly by index.
4064 pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
4065 ref = h->long_ref[pic_id];
4066 ref->pic_id= pic_id;
4067 assert(ref->reference == 3);
4068 assert(ref->long_ref);
4073 av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
4074 memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
// Shift the remaining entries down and insert the picture at 'index'.
4076 for(i=index; i+1<h->ref_count[list]; i++){
4077 if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
4080 for(; i > index; i--){
4081 h->ref_list[list][i]= h->ref_list[list][i-1];
4083 h->ref_list[list][index]= *ref;
4086 av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
4092 if(h->slice_type!=B_TYPE) break; // only B slices have list 1
// Replace any empty (missing) entries with the current picture.
4094 for(list=0; list<2; list++){
4095 for(index= 0; index < h->ref_count[list]; index++){
4096 if(!h->ref_list[list][index].data[0])
4097 h->ref_list[list][index]= s->current_picture;
4099 if(h->slice_type!=B_TYPE) break;
4102 if(h->slice_type==B_TYPE && !h->direct_spatial_mv_pred)
4103 direct_dist_scale_factor(h);
4104 direct_ref_list_init(h);
// For MBAFF decoding, derives per-field reference entries from each frame
// reference: slots [16+2*i] (top field) and [16+2*i+1] (bottom field) get
// doubled linesizes, with the bottom field's data pointers offset by one
// line.  Weighted-prediction parameters are duplicated for both fields.
4108 static void fill_mbaff_ref_list(H264Context *h){
4110 for(list=0; list<2; list++){
4111 for(i=0; i<h->ref_count[list]; i++){
4112 Picture *frame = &h->ref_list[list][i];
4113 Picture *field = &h->ref_list[list][16+2*i];
4116 field[0].linesize[j] <<= 1; // field stride = 2x frame stride
4117 field[1] = field[0];
4119 field[1].data[j] += frame->linesize[j]; // bottom field starts one line down
4121 h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
4122 h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
4124 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
4125 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
// Duplicate implicit bipred weights along both ref axes.
4129 for(j=0; j<h->ref_count[1]; j++){
4130 for(i=0; i<h->ref_count[0]; i++)
4131 h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
4132 memcpy(h->implicit_weight[16+2*j], h->implicit_weight[j], sizeof(*h->implicit_weight));
4133 memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
// Parses pred_weight_table() from the slice header (H.264 7.3.3.2):
// explicit weighted-prediction weights/offsets per reference, for luma and
// chroma, for L0 and (in B slices) L1.  Entries without a flag get the
// default weight 1<<log2_denom and offset 0.  Sets h->use_weight[_chroma]
// when any entry deviates from the default.
4137 static int pred_weight_table(H264Context *h){
4138 MpegEncContext * const s = &h->s;
4140 int luma_def, chroma_def;
4143 h->use_weight_chroma= 0;
4144 h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
4145 h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
4146 luma_def = 1<<h->luma_log2_weight_denom; // identity luma weight
4147 chroma_def = 1<<h->chroma_log2_weight_denom; // identity chroma weight
4149 for(list=0; list<2; list++){
4150 for(i=0; i<h->ref_count[list]; i++){
4151 int luma_weight_flag, chroma_weight_flag;
4153 luma_weight_flag= get_bits1(&s->gb);
4154 if(luma_weight_flag){
4155 h->luma_weight[list][i]= get_se_golomb(&s->gb);
4156 h->luma_offset[list][i]= get_se_golomb(&s->gb);
4157 if( h->luma_weight[list][i] != luma_def
4158 || h->luma_offset[list][i] != 0)
4161 h->luma_weight[list][i]= luma_def;
4162 h->luma_offset[list][i]= 0;
4165 chroma_weight_flag= get_bits1(&s->gb);
4166 if(chroma_weight_flag){
// One weight/offset pair each for Cb and Cr.
4169 h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
4170 h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
4171 if( h->chroma_weight[list][i][j] != chroma_def
4172 || h->chroma_offset[list][i][j] != 0)
4173 h->use_weight_chroma= 1;
4178 h->chroma_weight[list][i][j]= chroma_def;
4179 h->chroma_offset[list][i][j]= 0;
4183 if(h->slice_type != B_TYPE) break; // only B slices carry L1 weights
4185 h->use_weight= h->use_weight || h->use_weight_chroma;
/* Derives the implicit bi-prediction weight table for B slices
 * (weighted_bipred_idc == 2) from picture order counts, per the POC
 * distance formula: td/tb clipped to [-128,127], dist_scale_factor
 * clipped, and weights forced to 32/32 outside [-64,128].
 * NOTE(review): excerpt elided — the early-return body for the
 * single-ref equidistant case and some conditions are not visible. */
4189 static void implicit_weight_table(H264Context *h){
4190 MpegEncContext * const s = &h->s;
4192 int cur_poc = s->current_picture_ptr->poc;
/* fast path: one ref each side, equidistant from current POC */
4194 if( h->ref_count[0] == 1 && h->ref_count[1] == 1
4195 && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
4197 h->use_weight_chroma= 0;
4202 h->use_weight_chroma= 2;
4203 h->luma_log2_weight_denom= 5;
4204 h->chroma_log2_weight_denom= 5;
4206 for(ref0=0; ref0 < h->ref_count[0]; ref0++){
4207 int poc0 = h->ref_list[0][ref0].poc;
4208 for(ref1=0; ref1 < h->ref_count[1]; ref1++){
4209 int poc1 = h->ref_list[1][ref1].poc;
4210 int td = clip(poc1 - poc0, -128, 127);
4212 int tb = clip(cur_poc - poc0, -128, 127);
4213 int tx = (16384 + (ABS(td) >> 1)) / td;
4214 int dist_scale_factor = clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
/* out-of-range scale factors degrade to equal 32/32 weighting */
4215 if(dist_scale_factor < -64 || dist_scale_factor > 128)
4216 h->implicit_weight[ref0][ref1] = 32;
4218 h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
4220 h->implicit_weight[ref0][ref1] = 32;
/* Drops the decoder's reference on a picture; the delayed-output picture
 * and pictures still queued in delayed_pic[] are treated specially.
 * NOTE(review): body is heavily elided in this excerpt — the actual
 * reference-clearing statements are not visible; confirm against the
 * complete file before relying on details. */
4225 static inline void unreference_pic(H264Context *h, Picture *pic){
4228 if(pic == h->delayed_output_pic)
4231 for(i = 0; h->delayed_pic[i]; i++)
4232 if(pic == h->delayed_pic[i]){
4240 * instantaneous decoder refresh.
/* Handles an IDR: releases every long-term and short-term reference
 * picture and resets both reference counts to zero. */
4242 static void idr(H264Context *h){
4245 for(i=0; i<16; i++){
4246 if (h->long_ref[i] != NULL) {
4247 unreference_pic(h, h->long_ref[i]);
4248 h->long_ref[i]= NULL;
4251 h->long_ref_count=0;
4253 for(i=0; i<h->short_ref_count; i++){
4254 unreference_pic(h, h->short_ref[i]);
4255 h->short_ref[i]= NULL;
4257 h->short_ref_count=0;
4260 /* forget old pics after a seek */
/* avctx->flush() callback: clears the delayed-picture queue, the
 * delayed-output picture, and the current picture's reference flag so
 * stale DPB state cannot leak across a seek. */
4261 static void flush_dpb(AVCodecContext *avctx){
4262 H264Context *h= avctx->priv_data;
4264 for(i=0; i<16; i++) {
4265 if(h->delayed_pic[i])
4266 h->delayed_pic[i]->reference= 0;
4267 h->delayed_pic[i]= NULL;
4269 if(h->delayed_output_pic)
4270 h->delayed_output_pic->reference= 0;
4271 h->delayed_output_pic= NULL;
4273 if(h->s.current_picture_ptr)
4274 h->s.current_picture_ptr->reference= 0;
4279 * @return the removed picture or NULL if an error occurs
/* Removes the short-term reference with the given frame_num from
 * h->short_ref[], compacting the array and decrementing the count.
 * Emits MMCO debug traces when FF_DEBUG_MMCO is set.
 * NOTE(review): the return statements are elided from this excerpt. */
4281 static Picture * remove_short(H264Context *h, int frame_num){
4282 MpegEncContext * const s = &h->s;
4285 if(s->avctx->debug&FF_DEBUG_MMCO)
4286 av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
4288 for(i=0; i<h->short_ref_count; i++){
4289 Picture *pic= h->short_ref[i];
4290 if(s->avctx->debug&FF_DEBUG_MMCO)
4291 av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
4292 if(pic->frame_num == frame_num){
4293 h->short_ref[i]= NULL;
/* close the gap left by the removed entry */
4294 memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i - 1)*sizeof(Picture*));
4295 h->short_ref_count--;
4304 * @return the removed picture or NULL if an error occurs
/* Clears long-term reference slot i and decrements long_ref_count if a
 * picture was actually stored there.  (Return statement elided from
 * this excerpt.) */
4306 static Picture * remove_long(H264Context *h, int i){
4309 pic= h->long_ref[i];
4310 h->long_ref[i]= NULL;
4311 if(pic) h->long_ref_count--;
4317 * print short term list
/* Debug helper: dumps the short-term reference list (index, frame_num,
 * poc, data pointer) when FF_DEBUG_MMCO is enabled. */
4319 static void print_short_term(H264Context *h) {
4321 if(h->s.avctx->debug&FF_DEBUG_MMCO) {
4322 av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
4323 for(i=0; i<h->short_ref_count; i++){
4324 Picture *pic= h->short_ref[i];
4325 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
4331 * print long term list
/* Debug helper: dumps all 16 long-term reference slots when
 * FF_DEBUG_MMCO is enabled.  (The NULL-slot guard between the loop and
 * the av_log is elided from this excerpt.) */
4333 static void print_long_term(H264Context *h) {
4335 if(h->s.avctx->debug&FF_DEBUG_MMCO) {
4336 av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
4337 for(i = 0; i < 16; i++){
4338 Picture *pic= h->long_ref[i];
4340 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
4347 * Executes the reference picture marking (memory management control operations).
/* Applies the decoded MMCO list to the DPB state: unmarking short/long
 * refs, promoting short->long, marking the current picture long-term,
 * trimming the max long-term index, and full reset.  If the current
 * picture was not marked long-term, it is pushed onto the front of the
 * sliding-window short-term list.
 * NOTE(review): several case labels, break statements and braces are
 * elided from this excerpt. */
4349 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
4350 MpegEncContext * const s = &h->s;
4352 int current_is_long=0;
4355 if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
4356 av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
4358 for(i=0; i<mmco_count; i++){
4359 if(s->avctx->debug&FF_DEBUG_MMCO)
4360 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_frame_num, h->mmco[i].long_index);
4362 switch(mmco[i].opcode){
4363 case MMCO_SHORT2UNUSED:
4364 pic= remove_short(h, mmco[i].short_frame_num);
4366 unreference_pic(h, pic);
4367 else if(s->avctx->debug&FF_DEBUG_MMCO)
4368 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_short() failure\n");
4370 case MMCO_SHORT2LONG:
/* free any picture already occupying the target long-term slot */
4371 pic= remove_long(h, mmco[i].long_index);
4372 if(pic) unreference_pic(h, pic);
4374 h->long_ref[ mmco[i].long_index ]= remove_short(h, mmco[i].short_frame_num);
4375 h->long_ref[ mmco[i].long_index ]->long_ref=1;
4376 h->long_ref_count++;
4378 case MMCO_LONG2UNUSED:
4379 pic= remove_long(h, mmco[i].long_index);
4381 unreference_pic(h, pic);
4382 else if(s->avctx->debug&FF_DEBUG_MMCO)
4383 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_long() failure\n");
/* (elided case label, presumably MMCO_LONG) mark current pic long-term */
4386 pic= remove_long(h, mmco[i].long_index);
4387 if(pic) unreference_pic(h, pic);
4389 h->long_ref[ mmco[i].long_index ]= s->current_picture_ptr;
4390 h->long_ref[ mmco[i].long_index ]->long_ref=1;
4391 h->long_ref_count++;
4395 case MMCO_SET_MAX_LONG:
4396 assert(mmco[i].long_index <= 16);
4397 // just remove the long term which index is greater than new max
4398 for(j = mmco[i].long_index; j<16; j++){
4399 pic = remove_long(h, j);
4400 if (pic) unreference_pic(h, pic);
/* (elided case, presumably MMCO_RESET): drop every reference */
4404 while(h->short_ref_count){
4405 pic= remove_short(h, h->short_ref[0]->frame_num);
4406 unreference_pic(h, pic);
4408 for(j = 0; j < 16; j++) {
4409 pic= remove_long(h, j);
4410 if(pic) unreference_pic(h, pic);
4417 if(!current_is_long){
/* sliding window: evict a duplicate frame_num if present, then push
 * the current picture at the head of the short-term list */
4418 pic= remove_short(h, s->current_picture_ptr->frame_num);
4420 unreference_pic(h, pic);
4421 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
4424 if(h->short_ref_count)
4425 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
4427 h->short_ref[0]= s->current_picture_ptr;
4428 h->short_ref[0]->long_ref=0;
4429 h->short_ref_count++;
4432 print_short_term(h);
/* Parses dec_ref_pic_marking() from the slice header: for IDR slices,
 * no_output_of_prior_pics / long_term_reference flags; otherwise the
 * adaptive MMCO list (up to MAX_MMCO_COUNT), or an implicit sliding-
 * window SHORT2UNUSED when the reference buffer is full.
 * NOTE(review): excerpt elided — mmco count bookkeeping and returns are
 * not visible. */
4437 static int decode_ref_pic_marking(H264Context *h){
4438 MpegEncContext * const s = &h->s;
4441 if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
4442 s->broken_link= get_bits1(&s->gb) -1;
4443 h->mmco[0].long_index= get_bits1(&s->gb) - 1; // current_long_term_idx
4444 if(h->mmco[0].long_index == -1)
4447 h->mmco[0].opcode= MMCO_LONG;
4451 if(get_bits1(&s->gb)){ // adaptive_ref_pic_marking_mode_flag
4452 for(i= 0; i<MAX_MMCO_COUNT; i++) {
/* NOTE(review): stray double semicolon below — harmless empty
 * statement, present in the original source */
4453 MMCOOpcode opcode= get_ue_golomb(&s->gb);;
4455 h->mmco[i].opcode= opcode;
4456 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
4457 h->mmco[i].short_frame_num= (h->frame_num - get_ue_golomb(&s->gb) - 1) & ((1<<h->sps.log2_max_frame_num)-1); //FIXME fields
4458 /* if(h->mmco[i].short_frame_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_frame_num ] == NULL){
4459 av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
4463 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
4464 h->mmco[i].long_index= get_ue_golomb(&s->gb);
4465 if(/*h->mmco[i].long_index >= h->long_ref_count || h->long_ref[ h->mmco[i].long_index ] == NULL*/ h->mmco[i].long_index >= 16){
4466 av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
4471 if(opcode > MMCO_LONG){
4472 av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
4475 if(opcode == MMCO_END)
/* implicit sliding window: evict the oldest short-term ref */
4480 assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
4482 if(h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count){ //FIXME fields
4483 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
4484 h->mmco[0].short_frame_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
/* Computes the picture order count (POC) for the current picture using
 * one of the three poc_type derivations (0: msb/lsb wraparound,
 * 1: expected-delta cycle, 2: frame_num based), then stores the field
 * POCs and the frame POC (min of both fields).
 * NOTE(review): excerpt elided — poc_type 0 msb/lsb reset and the
 * field_poc[0] assignment for type 0, among others, are not visible. */
4494 static int init_poc(H264Context *h){
4495 MpegEncContext * const s = &h->s;
4496 const int max_frame_num= 1<<h->sps.log2_max_frame_num;
4499 if(h->nal_unit_type == NAL_IDR_SLICE){
4500 h->frame_num_offset= 0;
/* frame_num wrapped -> advance the offset by one wrap period */
4502 if(h->frame_num < h->prev_frame_num)
4503 h->frame_num_offset= h->prev_frame_num_offset + max_frame_num;
4505 h->frame_num_offset= h->prev_frame_num_offset;
4508 if(h->sps.poc_type==0){
4509 const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
4511 if(h->nal_unit_type == NAL_IDR_SLICE){
/* detect poc_lsb wraparound in either direction and adjust msb */
4516 if (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
4517 h->poc_msb = h->prev_poc_msb + max_poc_lsb;
4518 else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
4519 h->poc_msb = h->prev_poc_msb - max_poc_lsb;
4521 h->poc_msb = h->prev_poc_msb;
4522 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
4524 field_poc[1] = h->poc_msb + h->poc_lsb;
4525 if(s->picture_structure == PICT_FRAME)
4526 field_poc[1] += h->delta_poc_bottom;
4527 }else if(h->sps.poc_type==1){
4528 int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
4531 if(h->sps.poc_cycle_length != 0)
4532 abs_frame_num = h->frame_num_offset + h->frame_num;
4536 if(h->nal_ref_idc==0 && abs_frame_num > 0)
4539 expected_delta_per_poc_cycle = 0;
4540 for(i=0; i < h->sps.poc_cycle_length; i++)
4541 expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
4543 if(abs_frame_num > 0){
4544 int poc_cycle_cnt = (abs_frame_num - 1) / h->sps.poc_cycle_length;
4545 int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
4547 expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
4548 for(i = 0; i <= frame_num_in_poc_cycle; i++)
4549 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
4553 if(h->nal_ref_idc == 0)
4554 expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
4556 field_poc[0] = expectedpoc + h->delta_poc[0];
4557 field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
4559 if(s->picture_structure == PICT_FRAME)
4560 field_poc[1] += h->delta_poc[1];
/* (elided) poc_type 2: POC derived directly from frame_num */
4563 if(h->nal_unit_type == NAL_IDR_SLICE){
4566 if(h->nal_ref_idc) poc= 2*(h->frame_num_offset + h->frame_num);
4567 else poc= 2*(h->frame_num_offset + h->frame_num) - 1;
4573 if(s->picture_structure != PICT_BOTTOM_FIELD)
4574 s->current_picture_ptr->field_poc[0]= field_poc[0];
4575 if(s->picture_structure != PICT_TOP_FIELD)
4576 s->current_picture_ptr->field_poc[1]= field_poc[1];
4577 if(s->picture_structure == PICT_FRAME) // FIXME field pix?
4578 s->current_picture_ptr->poc= FFMIN(field_poc[0], field_poc[1]);
4584 * decodes a slice header.
4585 * this will allso call MPV_common_init() and frame_start() as needed
/* Parses the full slice header: slice type, PPS/SPS selection, picture
 * geometry (with context re-init when dimensions change), scan-table
 * setup keyed to the active IDCT implementation, frame_num, picture
 * structure / MBAFF, POC syntax, reference counts and list reordering,
 * weighted prediction tables, ref pic marking, QP, and the deblocking
 * filter parameters.
 * NOTE(review): this excerpt elides many interior lines (error returns,
 * else branches, braces, an #if 0 FMO region); comments below cover
 * only the visible statements. */
4587 static int decode_slice_header(H264Context *h){
4588 MpegEncContext * const s = &h->s;
4589 int first_mb_in_slice, pps_id;
4590 int num_ref_idx_active_override_flag;
4591 static const uint8_t slice_type_map[5]= {P_TYPE, B_TYPE, I_TYPE, SP_TYPE, SI_TYPE};
4593 int default_ref_list_done = 0;
4595 s->current_picture.reference= h->nal_ref_idc != 0;
4596 s->dropable= h->nal_ref_idc == 0;
4598 first_mb_in_slice= get_ue_golomb(&s->gb);
4600 slice_type= get_ue_golomb(&s->gb);
4602 av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
/* slice_type values 5..9 mean "fixed for the whole picture" */
4607 h->slice_type_fixed=1;
4609 h->slice_type_fixed=0;
4611 slice_type= slice_type_map[ slice_type ];
4612 if (slice_type == I_TYPE
4613 || (h->slice_num != 0 && slice_type == h->slice_type) ) {
4614 default_ref_list_done = 1;
4616 h->slice_type= slice_type;
4618 s->pict_type= h->slice_type; // to make a few old func happy, it's wrong though
4620 pps_id= get_ue_golomb(&s->gb);
4622 av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
4625 h->pps= h->pps_buffer[pps_id];
/* slice_group_count==0 marks an unpopulated PPS slot */
4626 if(h->pps.slice_group_count == 0){
4627 av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n");
4631 h->sps= h->sps_buffer[ h->pps.sps_id ];
4632 if(h->sps.log2_max_frame_num == 0){
4633 av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n");
4637 if(h->dequant_coeff_pps != pps_id){
4638 h->dequant_coeff_pps = pps_id;
4639 init_dequant_tables(h);
4642 s->mb_width= h->sps.mb_width;
4643 s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
4645 h->b_stride= s->mb_width*4;
4646 h->b8_stride= s->mb_width*2;
4648 s->width = 16*s->mb_width - 2*(h->sps.crop_left + h->sps.crop_right );
4649 if(h->sps.frame_mbs_only_flag)
4650 s->height= 16*s->mb_height - 2*(h->sps.crop_top + h->sps.crop_bottom);
4652 s->height= 16*s->mb_height - 4*(h->sps.crop_top + h->sps.crop_bottom); //FIXME recheck
4654 if (s->context_initialized
4655 && ( s->width != s->avctx->width || s->height != s->avctx->height)) {
4659 if (!s->context_initialized) {
4660 if (MPV_common_init(s) < 0)
/* scan tables: identity copies for the C IDCT, permuted otherwise */
4663 if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
4664 memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
4665 memcpy(h-> field_scan, field_scan, 16*sizeof(uint8_t));
4668 for(i=0; i<16; i++){
4669 #define T(x) (x>>2) | ((x<<2) & 0xF)
4670 h->zigzag_scan[i] = T(zigzag_scan[i]);
4671 h-> field_scan[i] = T( field_scan[i]);
4675 if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
4676 memcpy(h->zigzag_scan8x8, zigzag_scan8x8, 64*sizeof(uint8_t));
4677 memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
4678 memcpy(h->field_scan8x8, field_scan8x8, 64*sizeof(uint8_t));
4679 memcpy(h->field_scan8x8_cavlc, field_scan8x8_cavlc, 64*sizeof(uint8_t));
4682 for(i=0; i<64; i++){
4683 #define T(x) (x>>3) | ((x&7)<<3)
4684 h->zigzag_scan8x8[i] = T(zigzag_scan8x8[i]);
4685 h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
4686 h->field_scan8x8[i] = T(field_scan8x8[i]);
4687 h->field_scan8x8_cavlc[i] = T(field_scan8x8_cavlc[i]);
/* qp==0 tables: unpermuted scans when transform bypass is active */
4691 if(h->sps.transform_bypass){ //FIXME same ugly
4692 h->zigzag_scan_q0 = zigzag_scan;
4693 h->zigzag_scan8x8_q0 = zigzag_scan8x8;
4694 h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
4695 h->field_scan_q0 = field_scan;
4696 h->field_scan8x8_q0 = field_scan8x8;
4697 h->field_scan8x8_cavlc_q0 = field_scan8x8_cavlc;
4699 h->zigzag_scan_q0 = h->zigzag_scan;
4700 h->zigzag_scan8x8_q0 = h->zigzag_scan8x8;
4701 h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
4702 h->field_scan_q0 = h->field_scan;
4703 h->field_scan8x8_q0 = h->field_scan8x8;
4704 h->field_scan8x8_cavlc_q0 = h->field_scan8x8_cavlc;
4709 s->avctx->width = s->width;
4710 s->avctx->height = s->height;
4711 s->avctx->sample_aspect_ratio= h->sps.sar;
4712 if(!s->avctx->sample_aspect_ratio.den)
4713 s->avctx->sample_aspect_ratio.den = 1;
4715 if(h->sps.timing_info_present_flag){
4716 s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
/* old x264 builds wrote half the spec time_scale; compensate */
4717 if(h->x264_build > 0 && h->x264_build < 44)
4718 s->avctx->time_base.den *= 2;
4719 av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
4720 s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
4724 if(h->slice_num == 0){
4725 if(frame_start(h) < 0)
4729 s->current_picture_ptr->frame_num= //FIXME frame_num cleanup
4730 h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
4733 h->mb_aff_frame = 0;
4734 if(h->sps.frame_mbs_only_flag){
4735 s->picture_structure= PICT_FRAME;
4737 if(get_bits1(&s->gb)) { //field_pic_flag
4738 s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
4739 av_log(h->s.avctx, AV_LOG_ERROR, "PAFF interlacing is not implemented\n");
4741 s->picture_structure= PICT_FRAME;
4742 h->mb_aff_frame = h->sps.mb_aff;
4746 s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
4747 s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << h->mb_aff_frame;
4748 if(s->mb_y >= s->mb_height){
4752 if(s->picture_structure==PICT_FRAME){
4753 h->curr_pic_num= h->frame_num;
4754 h->max_pic_num= 1<< h->sps.log2_max_frame_num;
4756 h->curr_pic_num= 2*h->frame_num;
4757 h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
4760 if(h->nal_unit_type == NAL_IDR_SLICE){
4761 get_ue_golomb(&s->gb); /* idr_pic_id */
4764 if(h->sps.poc_type==0){
4765 h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
4767 if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
4768 h->delta_poc_bottom= get_se_golomb(&s->gb);
4772 if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
4773 h->delta_poc[0]= get_se_golomb(&s->gb);
4775 if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
4776 h->delta_poc[1]= get_se_golomb(&s->gb);
4781 if(h->pps.redundant_pic_cnt_present){
4782 h->redundant_pic_count= get_ue_golomb(&s->gb);
4785 //set defaults, might be overriden a few line later
4786 h->ref_count[0]= h->pps.ref_count[0];
4787 h->ref_count[1]= h->pps.ref_count[1];
4789 if(h->slice_type == P_TYPE || h->slice_type == SP_TYPE || h->slice_type == B_TYPE){
4790 if(h->slice_type == B_TYPE){
4791 h->direct_spatial_mv_pred= get_bits1(&s->gb);
4792 if(h->sps.mb_aff && h->direct_spatial_mv_pred)
4793 av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + spatial direct mode is not implemented\n");
4795 num_ref_idx_active_override_flag= get_bits1(&s->gb);
4797 if(num_ref_idx_active_override_flag){
4798 h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
4799 if(h->slice_type==B_TYPE)
4800 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
4802 if(h->ref_count[0] > 32 || h->ref_count[1] > 32){
4803 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
4809 if(!default_ref_list_done){
4810 fill_default_ref_list(h);
4813 if(decode_ref_pic_list_reordering(h) < 0)
4816 if( (h->pps.weighted_pred && (h->slice_type == P_TYPE || h->slice_type == SP_TYPE ))
4817 || (h->pps.weighted_bipred_idc==1 && h->slice_type==B_TYPE ) )
4818 pred_weight_table(h);
4819 else if(h->pps.weighted_bipred_idc==2 && h->slice_type==B_TYPE)
4820 implicit_weight_table(h);
4824 if(s->current_picture.reference)
4825 decode_ref_pic_marking(h);
4828 fill_mbaff_ref_list(h);
4830 if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE && h->pps.cabac )
4831 h->cabac_init_idc = get_ue_golomb(&s->gb);
4833 h->last_qscale_diff = 0;
4834 s->qscale = h->pps.init_qp + get_se_golomb(&s->gb);
4835 if(s->qscale<0 || s->qscale>51){
4836 av_log(s->avctx, AV_LOG_ERROR, "QP %d out of range\n", s->qscale);
4839 h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
4840 //FIXME qscale / qp ... stuff
4841 if(h->slice_type == SP_TYPE){
4842 get_bits1(&s->gb); /* sp_for_switch_flag */
4844 if(h->slice_type==SP_TYPE || h->slice_type == SI_TYPE){
4845 get_se_golomb(&s->gb); /* slice_qs_delta */
4848 h->deblocking_filter = 1;
4849 h->slice_alpha_c0_offset = 0;
4850 h->slice_beta_offset = 0;
4851 if( h->pps.deblocking_filter_parameters_present ) {
4852 h->deblocking_filter= get_ue_golomb(&s->gb);
4853 if(h->deblocking_filter < 2)
4854 h->deblocking_filter^= 1; // 1<->0
4856 if( h->deblocking_filter ) {
4857 h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4858 h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4861 if( s->avctx->skip_loop_filter >= AVDISCARD_ALL
4862 ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type != I_TYPE)
4863 ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR && h->slice_type == B_TYPE)
4864 ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4865 h->deblocking_filter= 0;
/* NOTE(review): the literal '?' below is a bit-width placeholder; in
 * the complete source this FMO line sits inside a compiled-out region
 * (#if 0) elided from this excerpt — confirm against the full file */
4868 if( h->pps.num_slice_groups > 1 && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4869 slice_group_change_cycle= get_bits(&s->gb, ?);
4874 h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4875 h->emu_edge_height= FRAME_MBAFF ? 0 : h->emu_edge_width;
4877 if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4878 av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%d frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s\n",
4880 (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4882 av_get_pict_type_char(h->slice_type),
4883 pps_id, h->frame_num,
4884 s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4885 h->ref_count[0], h->ref_count[1],
4887 h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4889 h->use_weight==1 && h->use_weight_chroma ? "c" : ""
/* non-reference frames may use the faster 2-tap qpel when FAST is set */
4893 if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !s->current_picture.reference){
4894 s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
4895 s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
4897 s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
4898 s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
/* Reads the CAVLC level_prefix: counts leading zero bits in the cache
 * via av_log2 (log = position of the terminating 1 bit), skips them,
 * and returns the prefix length (return elided from this excerpt). */
4907 static inline int get_level_prefix(GetBitContext *gb){
4911 OPEN_READER(re, gb);
4912 UPDATE_CACHE(re, gb);
4913 buf=GET_CACHE(re, gb);
4915 log= 32 - av_log2(buf);
4917 print_bin(buf>>(32-log), log);
4918 av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4921 LAST_SKIP_BITS(re, gb, log);
4922 CLOSE_READER(re, gb);
/* Returns whether the 8x8 DCT may be used for the current macroblock:
 * disallowed if any sub-macroblock partition is smaller than 8x8, or is
 * DIRECT without direct_8x8_inference.  (Loop header and returns are
 * elided from this excerpt.) */
4927 static inline int get_dct8x8_allowed(H264Context *h){
4930 if(!IS_SUB_8X8(h->sub_mb_type[i])
4931 || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4938 * decodes a residual block.
4939 * @param n block index
4940 * @param scantable scantable
4941 * @param max_coeff number of coefficients in the block
4942 * @return <0 if an error occured
/* CAVLC residual decode: reads coeff_token (table chosen by predicted
 * nonzero count), trailing ones, the level prefix/suffix codes,
 * total_zeros and run_before, then scatters the levels into `block`
 * via `scantable` — dequantizing with `qmul` on the second path.
 * NOTE(review): excerpt elided — some braces, early returns and the
 * branch split between the qmul/no-qmul output loops are not visible. */
4944 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4945 MpegEncContext * const s = &h->s;
4946 static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4948 int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4950 //FIXME put trailing_onex into the context
4952 if(n == CHROMA_DC_BLOCK_INDEX){
4953 coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4954 total_coeff= coeff_token>>2;
4956 if(n == LUMA_DC_BLOCK_INDEX){
4957 total_coeff= pred_non_zero_count(h, 0);
4958 coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4959 total_coeff= coeff_token>>2;
4961 total_coeff= pred_non_zero_count(h, n);
4962 coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4963 total_coeff= coeff_token>>2;
4964 h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4968 //FIXME set last_non_zero?
/* coeff_token packs trailing_ones in the low 2 bits */
4973 trailing_ones= coeff_token&3;
4974 tprintf("trailing:%d, total:%d\n", trailing_ones, total_coeff);
4975 assert(total_coeff<=16);
/* trailing ones carry only a sign bit each: 1 -> -1, 0 -> +1 */
4977 for(i=0; i<trailing_ones; i++){
4978 level[i]= 1 - 2*get_bits1(gb);
4982 int level_code, mask;
4983 int suffix_length = total_coeff > 10 && trailing_ones < 3;
4984 int prefix= get_level_prefix(gb);
4986 //first coefficient has suffix_length equal to 0 or 1
4987 if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4989 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4991 level_code= (prefix<<suffix_length); //part
4992 }else if(prefix==14){
4994 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4996 level_code= prefix + get_bits(gb, 4); //part
4997 }else if(prefix==15){
4998 level_code= (prefix<<suffix_length) + get_bits(gb, 12); //part
4999 if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
5001 av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
5005 if(trailing_ones < 3) level_code += 2;
/* zigzag-map level_code to a signed level: even -> +, odd -> - */
5010 mask= -(level_code&1);
5011 level[i]= (((2+level_code)>>1) ^ mask) - mask;
5014 //remaining coefficients have suffix_length > 0
5015 for(;i<total_coeff;i++) {
5016 static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
5017 prefix = get_level_prefix(gb);
5019 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
5020 }else if(prefix==15){
5021 level_code = (prefix<<suffix_length) + get_bits(gb, 12);
5023 av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
5026 mask= -(level_code&1);
5027 level[i]= (((2+level_code)>>1) ^ mask) - mask;
/* grow suffix_length once levels exceed the current limit */
5028 if(level_code > suffix_limit[suffix_length])
5033 if(total_coeff == max_coeff)
5036 if(n == CHROMA_DC_BLOCK_INDEX)
5037 zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
5039 zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
5042 coeff_num = zeros_left + total_coeff - 1;
5043 j = scantable[coeff_num];
/* first output path (no qmul): raw levels into scan positions */
5045 block[j] = level[0];
5046 for(i=1;i<total_coeff;i++) {
5049 else if(zeros_left < 7){
5050 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
5052 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
5054 zeros_left -= run_before;
5055 coeff_num -= 1 + run_before;
5056 j= scantable[ coeff_num ];
/* second output path: dequantize with qmul while scattering */
5061 block[j] = (level[0] * qmul[j] + 32)>>6;
5062 for(i=1;i<total_coeff;i++) {
5065 else if(zeros_left < 7){
5066 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
5068 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
5070 zeros_left -= run_before;
5071 coeff_num -= 1 + run_before;
5072 j= scantable[ coeff_num ];
5074 block[j]= (level[i] * qmul[j] + 32)>>6;
/* a negative zeros_left means the bitstream was inconsistent */
5079 av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
/* For MBAFF skip runs: predicts the field decoding flag of the current
 * MB pair from the left neighbour if it belongs to this slice, else the
 * top neighbour, else a fallback (elided from this excerpt). */
5086 static void predict_field_decoding_flag(H264Context *h){
5087 MpegEncContext * const s = &h->s;
5088 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5089 int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
5090 ? s->current_picture.mb_type[mb_xy-1]
5091 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
5092 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
5094 h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
5098 * decodes a P_SKIP or B_SKIP macroblock
/* Reconstructs a skipped macroblock: zeroes the nonzero-count caches,
 * builds a synthetic 16x16 skip mb_type (direct-predicted for B
 * slices, P-skip motion otherwise), then writes back motion, mb_type,
 * qscale and the slice table entry.
 * NOTE(review): excerpt elided — mb_type initialization and the MBAFF
 * condition guarding MB_TYPE_INTERLACED are not visible. */
5100 static void decode_mb_skip(H264Context *h){
5101 MpegEncContext * const s = &h->s;
5102 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5105 memset(h->non_zero_count[mb_xy], 0, 16);
5106 memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
5109 mb_type|= MB_TYPE_INTERLACED;
5111 if( h->slice_type == B_TYPE )
5113 // just for fill_caches. pred_direct_motion will set the real mb_type
5114 mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
5116 fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
5117 pred_direct_motion(h, &mb_type);
5118 mb_type|= MB_TYPE_SKIP;
/* (elided else branch): P-skip path */
5123 mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
5125 fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
5126 pred_pskip_motion(h, &mx, &my);
5127 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
5128 fill_rectangle( h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
5131 write_back_motion(h, mb_type);
5132 s->current_picture.mb_type[mb_xy]= mb_type;
5133 s->current_picture.qscale_table[mb_xy]= s->qscale;
5134 h->slice_table[ mb_xy ]= h->slice_num;
5135 h->prev_mb_skipped= 1;
5139 * decodes a macroblock
5140 * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5142 static int decode_mb_cavlc(H264Context *h){
5143 MpegEncContext * const s = &h->s;
5144 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5145 int mb_type, partition_count, cbp;
5146 int dct8x8_allowed= h->pps.transform_8x8_mode;
5148 s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
5150 tprintf("pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5151 cbp = 0; /* avoid warning. FIXME: find a solution without slowing
5153 if(h->slice_type != I_TYPE && h->slice_type != SI_TYPE){
5154 if(s->mb_skip_run==-1)
5155 s->mb_skip_run= get_ue_golomb(&s->gb);
5157 if (s->mb_skip_run--) {
5158 if(FRAME_MBAFF && (s->mb_y&1) == 0){
5159 if(s->mb_skip_run==0)
5160 h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
5162 predict_field_decoding_flag(h);
5169 if( (s->mb_y&1) == 0 )
5170 h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
5172 h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5174 h->prev_mb_skipped= 0;
5176 mb_type= get_ue_golomb(&s->gb);
5177 if(h->slice_type == B_TYPE){
5179 partition_count= b_mb_type_info[mb_type].partition_count;
5180 mb_type= b_mb_type_info[mb_type].type;
5183 goto decode_intra_mb;
5185 }else if(h->slice_type == P_TYPE /*|| h->slice_type == SP_TYPE */){
5187 partition_count= p_mb_type_info[mb_type].partition_count;
5188 mb_type= p_mb_type_info[mb_type].type;
5191 goto decode_intra_mb;
5194 assert(h->slice_type == I_TYPE);
5197 av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
5201 cbp= i_mb_type_info[mb_type].cbp;
5202 h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5203 mb_type= i_mb_type_info[mb_type].type;
5207 mb_type |= MB_TYPE_INTERLACED;
5209 h->slice_table[ mb_xy ]= h->slice_num;
5211 if(IS_INTRA_PCM(mb_type)){
5214 // we assume these blocks are very rare so we dont optimize it
5215 align_get_bits(&s->gb);
5217 // The pixels are stored in the same order as levels in h->mb array.
5218 for(y=0; y<16; y++){
5219 const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
5220 for(x=0; x<16; x++){
5221 tprintf("LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
5222 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
5226 const int index= 256 + 4*(y&3) + 32*(y>>2);
5228 tprintf("CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
5229 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
5233 const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
5235 tprintf("CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
5236 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
5240 // In deblocking, the quantizer is 0
5241 s->current_picture.qscale_table[mb_xy]= 0;
5242 h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0);
5243 // All coeffs are present
5244 memset(h->non_zero_count[mb_xy], 16, 16);
5246 s->current_picture.mb_type[mb_xy]= mb_type;
5251 h->ref_count[0] <<= 1;
5252 h->ref_count[1] <<= 1;
5255 fill_caches(h, mb_type, 0);
5258 if(IS_INTRA(mb_type)){
5259 // init_top_left_availability(h);
5260 if(IS_INTRA4x4(mb_type)){
5263 if(dct8x8_allowed && get_bits1(&s->gb)){
5264 mb_type |= MB_TYPE_8x8DCT;
5268 // fill_intra4x4_pred_table(h);
5269 for(i=0; i<16; i+=di){
5270 int mode= pred_intra_mode(h, i);
5272 if(!get_bits1(&s->gb)){
5273 const int rem_mode= get_bits(&s->gb, 3);
5274 mode = rem_mode + (rem_mode >= mode);
5278 fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5280 h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
5282 write_back_intra_pred_mode(h);
5283 if( check_intra4x4_pred_mode(h) < 0)
5286 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
5287 if(h->intra16x16_pred_mode < 0)
5290 h->chroma_pred_mode= get_ue_golomb(&s->gb);
5292 h->chroma_pred_mode= check_intra_pred_mode(h, h->chroma_pred_mode);
5293 if(h->chroma_pred_mode < 0)
5295 }else if(partition_count==4){
5296 int i, j, sub_partition_count[4], list, ref[2][4];
5298 if(h->slice_type == B_TYPE){
5300 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
5301 if(h->sub_mb_type[i] >=13){
5302 av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %d out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
5305 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5306 h->sub_mb_type[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5308 if( IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
5309 || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
5310 pred_direct_motion(h, &mb_type);
5311 h->ref_cache[0][scan8[4]] =
5312 h->ref_cache[1][scan8[4]] =
5313 h->ref_cache[0][scan8[12]] =
5314 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5317 assert(h->slice_type == P_TYPE || h->slice_type == SP_TYPE); //FIXME SP correct ?
5319 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
5320 if(h->sub_mb_type[i] >=4){
5321 av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %d out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
5324 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5325 h->sub_mb_type[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5329 for(list=0; list<2; list++){
5330 int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
5331 if(ref_count == 0) continue;
5333 if(IS_DIRECT(h->sub_mb_type[i])) continue;
5334 if(IS_DIR(h->sub_mb_type[i], 0, list)){
5335 ref[list][i] = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
5344 dct8x8_allowed = get_dct8x8_allowed(h);
5346 for(list=0; list<2; list++){
5347 const int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
5348 if(ref_count == 0) continue;
5351 if(IS_DIRECT(h->sub_mb_type[i])) {
5352 h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
5355 h->ref_cache[list][ scan8[4*i] ]=h->ref_cache[list][ scan8[4*i]+1 ]=
5356 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5358 if(IS_DIR(h->sub_mb_type[i], 0, list)){
5359 const int sub_mb_type= h->sub_mb_type[i];
5360 const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5361 for(j=0; j<sub_partition_count[i]; j++){
5363 const int index= 4*i + block_width*j;
5364 int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5365 pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
5366 mx += get_se_golomb(&s->gb);
5367 my += get_se_golomb(&s->gb);
5368 tprintf("final mv:%d %d\n", mx, my);
5370 if(IS_SUB_8X8(sub_mb_type)){
5371 mv_cache[ 0 ][0]= mv_cache[ 1 ][0]=
5372 mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5373 mv_cache[ 0 ][1]= mv_cache[ 1 ][1]=
5374 mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5375 }else if(IS_SUB_8X4(sub_mb_type)){
5376 mv_cache[ 0 ][0]= mv_cache[ 1 ][0]= mx;
5377 mv_cache[ 0 ][1]= mv_cache[ 1 ][1]= my;
5378 }else if(IS_SUB_4X8(sub_mb_type)){
5379 mv_cache[ 0 ][0]= mv_cache[ 8 ][0]= mx;
5380 mv_cache[ 0 ][1]= mv_cache[ 8 ][1]= my;
5382 assert(IS_SUB_4X4(sub_mb_type));
5383 mv_cache[ 0 ][0]= mx;
5384 mv_cache[ 0 ][1]= my;
5388 uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5394 }else if(IS_DIRECT(mb_type)){
5395 pred_direct_motion(h, &mb_type);
5396 dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5398 int list, mx, my, i;
5399 //FIXME we should set ref_idx_l? to 0 if we use that later ...
5400 if(IS_16X16(mb_type)){
5401 for(list=0; list<2; list++){
5402 if(h->ref_count[list]>0){
5403 if(IS_DIR(mb_type, 0, list)){
5404 const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
5405 fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
5407 fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (LIST_NOT_USED&0xFF), 1);
5410 for(list=0; list<2; list++){
5411 if(IS_DIR(mb_type, 0, list)){
5412 pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
5413 mx += get_se_golomb(&s->gb);
5414 my += get_se_golomb(&s->gb);
5415 tprintf("final mv:%d %d\n", mx, my);
5417 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5419 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5422 else if(IS_16X8(mb_type)){
5423 for(list=0; list<2; list++){
5424 if(h->ref_count[list]>0){
5426 if(IS_DIR(mb_type, i, list)){
5427 const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
5428 fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
5430 fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5434 for(list=0; list<2; list++){
5436 if(IS_DIR(mb_type, i, list)){
5437 pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
5438 mx += get_se_golomb(&s->gb);
5439 my += get_se_golomb(&s->gb);
5440 tprintf("final mv:%d %d\n", mx, my);
5442 fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5444 fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5448 assert(IS_8X16(mb_type));
5449 for(list=0; list<2; list++){
5450 if(h->ref_count[list]>0){
5452 if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5453 const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
5454 fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
5456 fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5460 for(list=0; list<2; list++){
5462 if(IS_DIR(mb_type, i, list)){
5463 pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
5464 mx += get_se_golomb(&s->gb);
5465 my += get_se_golomb(&s->gb);
5466 tprintf("final mv:%d %d\n", mx, my);
5468 fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5470 fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5476 if(IS_INTER(mb_type))
5477 write_back_motion(h, mb_type);
5479 if(!IS_INTRA16x16(mb_type)){
5480 cbp= get_ue_golomb(&s->gb);
5482 av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%d) at %d %d\n", cbp, s->mb_x, s->mb_y);
5486 if(IS_INTRA4x4(mb_type))
5487 cbp= golomb_to_intra4x4_cbp[cbp];
5489 cbp= golomb_to_inter_cbp[cbp];
5493 if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
5494 if(get_bits1(&s->gb))
5495 mb_type |= MB_TYPE_8x8DCT;
5497 s->current_picture.mb_type[mb_xy]= mb_type;
5499 if(cbp || IS_INTRA16x16(mb_type)){
5500 int i8x8, i4x4, chroma_idx;
5501 int chroma_qp, dquant;
5502 GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
5503 const uint8_t *scan, *scan8x8, *dc_scan;
5505 // fill_non_zero_count_cache(h);
5507 if(IS_INTERLACED(mb_type)){
5508 scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
5509 scan= s->qscale ? h->field_scan : h->field_scan_q0;
5510 dc_scan= luma_dc_field_scan;
5512 scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
5513 scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5514 dc_scan= luma_dc_zigzag_scan;
5517 dquant= get_se_golomb(&s->gb);
5519 if( dquant > 25 || dquant < -26 ){
5520 av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
5524 s->qscale += dquant;
5525 if(((unsigned)s->qscale) > 51){
5526 if(s->qscale<0) s->qscale+= 52;
5527 else s->qscale-= 52;
5530 h->chroma_qp= chroma_qp= get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
5531 if(IS_INTRA16x16(mb_type)){
5532 if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
5533 return -1; //FIXME continue if partitioned and other return -1 too
5536 assert((cbp&15) == 0 || (cbp&15) == 15);
5539 for(i8x8=0; i8x8<4; i8x8++){
5540 for(i4x4=0; i4x4<4; i4x4++){
5541 const int index= i4x4 + 4*i8x8;
5542 if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
5548 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5551 for(i8x8=0; i8x8<4; i8x8++){
5552 if(cbp & (1<<i8x8)){
5553 if(IS_8x8DCT(mb_type)){
5554 DCTELEM *buf = &h->mb[64*i8x8];
5556 for(i4x4=0; i4x4<4; i4x4++){
5557 if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
5558 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
5561 nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5562 nnz[0] += nnz[1] + nnz[8] + nnz[9];
5564 for(i4x4=0; i4x4<4; i4x4++){
5565 const int index= i4x4 + 4*i8x8;
5567 if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
5573 uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5574 nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5580 for(chroma_idx=0; chroma_idx<2; chroma_idx++)
5581 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
5587 for(chroma_idx=0; chroma_idx<2; chroma_idx++){
5588 for(i4x4=0; i4x4<4; i4x4++){
5589 const int index= 16 + 4*chroma_idx + i4x4;
5590 if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][chroma_qp], 15) < 0){
5596 uint8_t * const nnz= &h->non_zero_count_cache[0];
5597 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5598 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5601 uint8_t * const nnz= &h->non_zero_count_cache[0];
5602 fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5603 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5604 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5606 s->current_picture.qscale_table[mb_xy]= s->qscale;
5607 write_back_non_zero_count(h);
5610 h->ref_count[0] >>= 1;
5611 h->ref_count[1] >>= 1;
5617 static int decode_cabac_field_decoding_flag(H264Context *h) {
5618 MpegEncContext * const s = &h->s;
5619 const int mb_x = s->mb_x;
5620 const int mb_y = s->mb_y & ~1;
5621 const int mba_xy = mb_x - 1 + mb_y *s->mb_stride;
5622 const int mbb_xy = mb_x + (mb_y-2)*s->mb_stride;
5624 unsigned int ctx = 0;
5626 if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
5629 if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
5633 return get_cabac( &h->cabac, &h->cabac_state[70 + ctx] );
5636 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
5637 uint8_t *state= &h->cabac_state[ctx_base];
5641 MpegEncContext * const s = &h->s;
5642 const int mba_xy = h->left_mb_xy[0];
5643 const int mbb_xy = h->top_mb_xy;
5645 if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
5647 if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
5649 if( get_cabac( &h->cabac, &state[ctx] ) == 0 )
5650 return 0; /* I4x4 */
5653 if( get_cabac( &h->cabac, &state[0] ) == 0 )
5654 return 0; /* I4x4 */
5657 if( get_cabac_terminate( &h->cabac ) )
5658 return 25; /* PCM */
5660 mb_type = 1; /* I16x16 */
5661 mb_type += 12 * get_cabac( &h->cabac, &state[1] ); /* cbp_luma != 0 */
5662 if( get_cabac( &h->cabac, &state[2] ) ) /* cbp_chroma */
5663 mb_type += 4 + 4 * get_cabac( &h->cabac, &state[2+intra_slice] );
5664 mb_type += 2 * get_cabac( &h->cabac, &state[3+intra_slice] );
5665 mb_type += 1 * get_cabac( &h->cabac, &state[3+2*intra_slice] );
5669 static int decode_cabac_mb_type( H264Context *h ) {
5670 MpegEncContext * const s = &h->s;
5672 if( h->slice_type == I_TYPE ) {
5673 return decode_cabac_intra_mb_type(h, 3, 1);
5674 } else if( h->slice_type == P_TYPE ) {
5675 if( get_cabac( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5677 if( get_cabac( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5678 /* P_L0_D16x16, P_8x8 */
5679 return 3 * get_cabac( &h->cabac, &h->cabac_state[16] );
5681 /* P_L0_D8x16, P_L0_D16x8 */
5682 return 2 - get_cabac( &h->cabac, &h->cabac_state[17] );
5685 return decode_cabac_intra_mb_type(h, 17, 0) + 5;
5687 } else if( h->slice_type == B_TYPE ) {
5688 const int mba_xy = h->left_mb_xy[0];
5689 const int mbb_xy = h->top_mb_xy;
5693 if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
5695 if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
5698 if( !get_cabac( &h->cabac, &h->cabac_state[27+ctx] ) )
5699 return 0; /* B_Direct_16x16 */
5701 if( !get_cabac( &h->cabac, &h->cabac_state[27+3] ) ) {
5702 return 1 + get_cabac( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
5705 bits = get_cabac( &h->cabac, &h->cabac_state[27+4] ) << 3;
5706 bits|= get_cabac( &h->cabac, &h->cabac_state[27+5] ) << 2;
5707 bits|= get_cabac( &h->cabac, &h->cabac_state[27+5] ) << 1;
5708 bits|= get_cabac( &h->cabac, &h->cabac_state[27+5] );
5710 return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
5711 else if( bits == 13 ) {
5712 return decode_cabac_intra_mb_type(h, 32, 0) + 23;
5713 } else if( bits == 14 )
5714 return 11; /* B_L1_L0_8x16 */
5715 else if( bits == 15 )
5716 return 22; /* B_8x8 */
5718 bits= ( bits<<1 ) | get_cabac( &h->cabac, &h->cabac_state[27+5] );
5719 return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5721 /* TODO SI/SP frames? */
5726 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
5727 MpegEncContext * const s = &h->s;
5731 if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
5732 int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
5735 && h->slice_table[mba_xy] == h->slice_num
5736 && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
5737 mba_xy += s->mb_stride;
5739 mbb_xy = mb_xy - s->mb_stride;
5741 && h->slice_table[mbb_xy] == h->slice_num
5742 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
5743 mbb_xy -= s->mb_stride;
5745 mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
5747 int mb_xy = mb_x + mb_y*s->mb_stride;
5749 mbb_xy = mb_xy - s->mb_stride;
5752 if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5754 if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5757 if( h->slice_type == B_TYPE )
5759 return get_cabac( &h->cabac, &h->cabac_state[11+ctx] );
5762 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5765 if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5768 mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5769 mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5770 mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5772 if( mode >= pred_mode )
5778 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5779 const int mba_xy = h->left_mb_xy[0];
5780 const int mbb_xy = h->top_mb_xy;
5784 /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5785 if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5788 if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5791 if( get_cabac( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5794 if( get_cabac( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5796 if( get_cabac( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5802 static const uint8_t block_idx_x[16] = {
5803 0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
5805 static const uint8_t block_idx_y[16] = {
5806 0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3
5808 static const uint8_t block_idx_xy[4][4] = {
5815 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5820 if( h->slice_table[h->top_mb_xy] == h->slice_num ) {
5822 tprintf("cbp_b = top_cbp = %x\n", cbp_b);
5825 for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5830 x = block_idx_x[4*i8x8];
5831 y = block_idx_y[4*i8x8];
5835 else if( h->slice_table[h->left_mb_xy[0]] == h->slice_num ) {
5836 cbp_a = h->left_cbp;
5837 tprintf("cbp_a = left_cbp = %x\n", cbp_a);
5843 /* No need to test for skip as we put 0 for skip block */
5844 /* No need to test for IPCM as we put 1 for IPCM block */
5846 int i8x8a = block_idx_xy[(x-1)&0x03][y]/4;
5847 if( ((cbp_a >> i8x8a)&0x01) == 0 )
5852 int i8x8b = block_idx_xy[x][(y-1)&0x03]/4;
5853 if( ((cbp_b >> i8x8b)&0x01) == 0 )
5857 if( get_cabac( &h->cabac, &h->cabac_state[73 + ctx] ) ) {
5863 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5867 cbp_a = (h->left_cbp>>4)&0x03;
5868 cbp_b = (h-> top_cbp>>4)&0x03;
5871 if( cbp_a > 0 ) ctx++;
5872 if( cbp_b > 0 ) ctx += 2;
5873 if( get_cabac( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5877 if( cbp_a == 2 ) ctx++;
5878 if( cbp_b == 2 ) ctx += 2;
5879 return 1 + get_cabac( &h->cabac, &h->cabac_state[77 + ctx] );
5881 static int decode_cabac_mb_dqp( H264Context *h) {
5882 MpegEncContext * const s = &h->s;
5888 mbn_xy = s->mb_x + s->mb_y*s->mb_stride - 1;
5890 mbn_xy = s->mb_width - 1 + (s->mb_y-1)*s->mb_stride;
5892 if( h->last_qscale_diff != 0 )
5895 while( get_cabac( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5901 if(val > 102) //prevent infinite loop
5908 return -(val + 1)/2;
5910 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5911 if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5913 if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5915 if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5919 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5921 if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5922 return 0; /* B_Direct_8x8 */
5923 if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5924 return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5926 if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5927 if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5928 return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5931 type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5932 type += get_cabac( &h->cabac, &h->cabac_state[39] );
5936 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5937 return get_cabac( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5940 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5941 int refa = h->ref_cache[list][scan8[n] - 1];
5942 int refb = h->ref_cache[list][scan8[n] - 8];
5946 if( h->slice_type == B_TYPE) {
5947 if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5949 if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5958 while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5968 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5969 int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5970 abs( h->mvd_cache[list][scan8[n] - 8][l] );
5971 int ctxbase = (l == 0) ? 40 : 47;
5976 else if( amvd > 32 )
5981 if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5986 while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5994 while( get_cabac_bypass( &h->cabac ) ) {
5999 if( get_cabac_bypass( &h->cabac ) )
6003 if( get_cabac_bypass( &h->cabac ) ) return -mvd;
6007 static int inline get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
6012 nza = h->left_cbp&0x100;
6013 nzb = h-> top_cbp&0x100;
6014 } else if( cat == 1 || cat == 2 ) {
6015 nza = h->non_zero_count_cache[scan8[idx] - 1];
6016 nzb = h->non_zero_count_cache[scan8[idx] - 8];
6017 } else if( cat == 3 ) {
6018 nza = (h->left_cbp>>(6+idx))&0x01;
6019 nzb = (h-> top_cbp>>(6+idx))&0x01;
6022 nza = h->non_zero_count_cache[scan8[16+idx] - 1];
6023 nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
6032 return ctx + 4 * cat;
6035 static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff) {
6036 const int mb_xy = h->s.mb_x + h->s.mb_y*h->s.mb_stride;
6037 static const int significant_coeff_flag_offset[2][6] = {
6038 { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
6039 { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
6041 static const int last_coeff_flag_offset[2][6] = {
6042 { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
6043 { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
6045 static const int coeff_abs_level_m1_offset[6] = {
6046 227+0, 227+10, 227+20, 227+30, 227+39, 426
6048 static const int significant_coeff_flag_offset_8x8[2][63] = {
6049 { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
6050 4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
6051 7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
6052 12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
6053 { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
6054 6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
6055 9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
6056 9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
6058 static const int last_coeff_flag_offset_8x8[63] = {
6059 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6060 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
6061 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
6062 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
6068 int coeff_count = 0;
6071 int abslevelgt1 = 0;
6073 uint8_t *significant_coeff_ctx_base;
6074 uint8_t *last_coeff_ctx_base;
6075 uint8_t *abs_level_m1_ctx_base;
6077 /* cat: 0-> DC 16x16 n = 0
6078 * 1-> AC 16x16 n = luma4x4idx
6079 * 2-> Luma4x4 n = luma4x4idx
6080 * 3-> DC Chroma n = iCbCr
6081 * 4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
6082 * 5-> Luma8x8 n = 4 * luma8x8idx
6085 /* read coded block flag */
6087 if( get_cabac( &h->cabac, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n ) ] ) == 0 ) {
6088 if( cat == 1 || cat == 2 )
6089 h->non_zero_count_cache[scan8[n]] = 0;
6091 h->non_zero_count_cache[scan8[16+n]] = 0;
6097 significant_coeff_ctx_base = h->cabac_state
6098 + significant_coeff_flag_offset[MB_FIELD][cat];
6099 last_coeff_ctx_base = h->cabac_state
6100 + last_coeff_flag_offset[MB_FIELD][cat];
6101 abs_level_m1_ctx_base = h->cabac_state
6102 + coeff_abs_level_m1_offset[cat];
6105 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
6106 for(last= 0; last < coefs; last++) { \
6107 uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
6108 if( get_cabac( &h->cabac, sig_ctx )) { \
6109 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
6110 index[coeff_count++] = last; \
6111 if( get_cabac( &h->cabac, last_ctx ) ) { \
6117 const int *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
6118 DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
6120 DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
6122 if( last == max_coeff -1 ) {
6123 index[coeff_count++] = last;
6125 assert(coeff_count > 0);
6128 h->cbp_table[mb_xy] |= 0x100;
6129 else if( cat == 1 || cat == 2 )
6130 h->non_zero_count_cache[scan8[n]] = coeff_count;
6132 h->cbp_table[mb_xy] |= 0x40 << n;
6134 h->non_zero_count_cache[scan8[16+n]] = coeff_count;
6137 fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
6140 for( i = coeff_count - 1; i >= 0; i-- ) {
6141 uint8_t *ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 )) + abs_level_m1_ctx_base;
6142 int j= scantable[index[i]];
6144 if( get_cabac( &h->cabac, ctx ) == 0 ) {
6146 if( get_cabac_bypass( &h->cabac ) ) block[j] = -1;
6149 if( get_cabac_bypass( &h->cabac ) ) block[j] = (-qmul[j] + 32) >> 6;
6150 else block[j] = ( qmul[j] + 32) >> 6;
6156 ctx = 5 + FFMIN( 4, abslevelgt1 ) + abs_level_m1_ctx_base;
6157 while( coeff_abs < 15 && get_cabac( &h->cabac, ctx ) ) {
6161 if( coeff_abs >= 15 ) {
6163 while( get_cabac_bypass( &h->cabac ) ) {
6164 coeff_abs += 1 << j;
6169 if( get_cabac_bypass( &h->cabac ) )
6170 coeff_abs += 1 << j ;
6175 if( get_cabac_bypass( &h->cabac ) ) block[j] = -coeff_abs;
6176 else block[j] = coeff_abs;
6178 if( get_cabac_bypass( &h->cabac ) ) block[j] = (-coeff_abs * qmul[j] + 32) >> 6;
6179 else block[j] = ( coeff_abs * qmul[j] + 32) >> 6;
6188 static void inline compute_mb_neighbors(H264Context *h)
6190 MpegEncContext * const s = &h->s;
6191 const int mb_xy = s->mb_x + s->mb_y*s->mb_stride;
6192 h->top_mb_xy = mb_xy - s->mb_stride;
6193 h->left_mb_xy[0] = mb_xy - 1;
6195 const int pair_xy = s->mb_x + (s->mb_y & ~1)*s->mb_stride;
6196 const int top_pair_xy = pair_xy - s->mb_stride;
6197 const int top_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
6198 const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
6199 const int curr_mb_frame_flag = !MB_FIELD;
6200 const int bottom = (s->mb_y & 1);
6202 ? !curr_mb_frame_flag // bottom macroblock
6203 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
6205 h->top_mb_xy -= s->mb_stride;
6207 if (left_mb_frame_flag != curr_mb_frame_flag) {
6208 h->left_mb_xy[0] = pair_xy - 1;
6215 * decodes a macroblock
6216 * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
6218 static int decode_mb_cabac(H264Context *h) {
6219 MpegEncContext * const s = &h->s;
6220 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
6221 int mb_type, partition_count, cbp = 0;
6222 int dct8x8_allowed= h->pps.transform_8x8_mode;
6224 s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
6226 tprintf("pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
6227 if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE ) {
6229 /* a skipped mb needs the aff flag from the following mb */
6230 if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
6231 predict_field_decoding_flag(h);
6232 if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
6233 skip = h->next_mb_skipped;
6235 skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
6236 /* read skip flags */
6238 if( FRAME_MBAFF && (s->mb_y&1)==0 ){
6239 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
6240 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
6241 if(h->next_mb_skipped)
6242 predict_field_decoding_flag(h);
6244 h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
6249 h->cbp_table[mb_xy] = 0;
6250 h->chroma_pred_mode_table[mb_xy] = 0;
6251 h->last_qscale_diff = 0;
6258 if( (s->mb_y&1) == 0 )
6260 h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
6262 h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
6264 h->prev_mb_skipped = 0;
6266 compute_mb_neighbors(h);
6267 if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
6268 av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
6272 if( h->slice_type == B_TYPE ) {
6274 partition_count= b_mb_type_info[mb_type].partition_count;
6275 mb_type= b_mb_type_info[mb_type].type;
6278 goto decode_intra_mb;
6280 } else if( h->slice_type == P_TYPE ) {
6282 partition_count= p_mb_type_info[mb_type].partition_count;
6283 mb_type= p_mb_type_info[mb_type].type;
6286 goto decode_intra_mb;
6289 assert(h->slice_type == I_TYPE);
6291 partition_count = 0;
6292 cbp= i_mb_type_info[mb_type].cbp;
6293 h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
6294 mb_type= i_mb_type_info[mb_type].type;
6297 mb_type |= MB_TYPE_INTERLACED;
6299 h->slice_table[ mb_xy ]= h->slice_num;
6301 if(IS_INTRA_PCM(mb_type)) {
6305 // We assume these blocks are very rare so we dont optimize it.
6306 // FIXME The two following lines get the bitstream position in the cabac
6307 // decode, I think it should be done by a function in cabac.h (or cabac.c).
6308 ptr= h->cabac.bytestream;
6309 if (h->cabac.low&0x1) ptr-=CABAC_BITS/8;
6311 // The pixels are stored in the same order as levels in h->mb array.
6312 for(y=0; y<16; y++){
6313 const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
6314 for(x=0; x<16; x++){
6315 tprintf("LUMA ICPM LEVEL (%3d)\n", *ptr);
6316 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
6320 const int index= 256 + 4*(y&3) + 32*(y>>2);
6322 tprintf("CHROMA U ICPM LEVEL (%3d)\n", *ptr);
6323 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
6327 const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
6329 tprintf("CHROMA V ICPM LEVEL (%3d)\n", *ptr);
6330 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
6334 ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
6336 // All blocks are present
6337 h->cbp_table[mb_xy] = 0x1ef;
6338 h->chroma_pred_mode_table[mb_xy] = 0;
6339 // In deblocking, the quantizer is 0
6340 s->current_picture.qscale_table[mb_xy]= 0;
6341 h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0);
6342 // All coeffs are present
6343 memset(h->non_zero_count[mb_xy], 16, 16);
6344 s->current_picture.mb_type[mb_xy]= mb_type;
6349 h->ref_count[0] <<= 1;
6350 h->ref_count[1] <<= 1;
6353 fill_caches(h, mb_type, 0);
6355 if( IS_INTRA( mb_type ) ) {
6357 if( IS_INTRA4x4( mb_type ) ) {
6358 if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
6359 mb_type |= MB_TYPE_8x8DCT;
6360 for( i = 0; i < 16; i+=4 ) {
6361 int pred = pred_intra_mode( h, i );
6362 int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
6363 fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
6366 for( i = 0; i < 16; i++ ) {
6367 int pred = pred_intra_mode( h, i );
6368 h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
6370 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
6373 write_back_intra_pred_mode(h);
6374 if( check_intra4x4_pred_mode(h) < 0 ) return -1;
6376 h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
6377 if( h->intra16x16_pred_mode < 0 ) return -1;
6379 h->chroma_pred_mode_table[mb_xy] =
6380 h->chroma_pred_mode = decode_cabac_mb_chroma_pre_mode( h );
6382 h->chroma_pred_mode= check_intra_pred_mode( h, h->chroma_pred_mode );
6383 if( h->chroma_pred_mode < 0 ) return -1;
6384 } else if( partition_count == 4 ) {
6385 int i, j, sub_partition_count[4], list, ref[2][4];
6387 if( h->slice_type == B_TYPE ) {
6388 for( i = 0; i < 4; i++ ) {
6389 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
6390 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
6391 h->sub_mb_type[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
6393 if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
6394 h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
6395 pred_direct_motion(h, &mb_type);
6396 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
6397 for( i = 0; i < 4; i++ )
6398 if( IS_DIRECT(h->sub_mb_type[i]) )
6399 fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
6403 for( i = 0; i < 4; i++ ) {
6404 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
6405 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
6406 h->sub_mb_type[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
6410 for( list = 0; list < 2; list++ ) {
6411 if( h->ref_count[list] > 0 ) {
6412 for( i = 0; i < 4; i++ ) {
6413 if(IS_DIRECT(h->sub_mb_type[i])) continue;
6414 if(IS_DIR(h->sub_mb_type[i], 0, list)){
6415 if( h->ref_count[list] > 1 )
6416 ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
6422 h->ref_cache[list][ scan8[4*i]+1 ]=
6423 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
6429 dct8x8_allowed = get_dct8x8_allowed(h);
6431 for(list=0; list<2; list++){
6433 if(IS_DIRECT(h->sub_mb_type[i])){
6434 fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
6437 h->ref_cache[list][ scan8[4*i] ]=h->ref_cache[list][ scan8[4*i]+1 ];
6439 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
6440 const int sub_mb_type= h->sub_mb_type[i];
6441 const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
6442 for(j=0; j<sub_partition_count[i]; j++){
6445 const int index= 4*i + block_width*j;
6446 int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
6447 int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
6448 pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
6450 mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
6451 my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
6452 tprintf("final mv:%d %d\n", mx, my);
6454 if(IS_SUB_8X8(sub_mb_type)){
6455 mv_cache[ 0 ][0]= mv_cache[ 1 ][0]=
6456 mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
6457 mv_cache[ 0 ][1]= mv_cache[ 1 ][1]=
6458 mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
6460 mvd_cache[ 0 ][0]= mvd_cache[ 1 ][0]=
6461 mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
6462 mvd_cache[ 0 ][1]= mvd_cache[ 1 ][1]=
6463 mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
6464 }else if(IS_SUB_8X4(sub_mb_type)){
6465 mv_cache[ 0 ][0]= mv_cache[ 1 ][0]= mx;
6466 mv_cache[ 0 ][1]= mv_cache[ 1 ][1]= my;
6468 mvd_cache[ 0 ][0]= mvd_cache[ 1 ][0]= mx- mpx;
6469 mvd_cache[ 0 ][1]= mvd_cache[ 1 ][1]= my - mpy;
6470 }else if(IS_SUB_4X8(sub_mb_type)){
6471 mv_cache[ 0 ][0]= mv_cache[ 8 ][0]= mx;
6472 mv_cache[ 0 ][1]= mv_cache[ 8 ][1]= my;
6474 mvd_cache[ 0 ][0]= mvd_cache[ 8 ][0]= mx - mpx;
6475 mvd_cache[ 0 ][1]= mvd_cache[ 8 ][1]= my - mpy;
6477 assert(IS_SUB_4X4(sub_mb_type));
6478 mv_cache[ 0 ][0]= mx;
6479 mv_cache[ 0 ][1]= my;
6481 mvd_cache[ 0 ][0]= mx - mpx;
6482 mvd_cache[ 0 ][1]= my - mpy;
6486 uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
6487 uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
6488 p[0] = p[1] = p[8] = p[9] = 0;
6489 pd[0]= pd[1]= pd[8]= pd[9]= 0;
6493 } else if( IS_DIRECT(mb_type) ) {
6494 pred_direct_motion(h, &mb_type);
6495 fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
6496 fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
6497 dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
6499 int list, mx, my, i, mpx, mpy;
6500 if(IS_16X16(mb_type)){
6501 for(list=0; list<2; list++){
6502 if(IS_DIR(mb_type, 0, list)){
6503 if(h->ref_count[list] > 0 ){
6504 const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
6505 fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
6508 fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
6510 for(list=0; list<2; list++){
6511 if(IS_DIR(mb_type, 0, list)){
6512 pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
6514 mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
6515 my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
6516 tprintf("final mv:%d %d\n", mx, my);
6518 fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
6519 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
6521 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
6524 else if(IS_16X8(mb_type)){
6525 for(list=0; list<2; list++){
6526 if(h->ref_count[list]>0){
6528 if(IS_DIR(mb_type, i, list)){
6529 const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
6530 fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
6532 fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
6536 for(list=0; list<2; list++){
6538 if(IS_DIR(mb_type, i, list)){
6539 pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
6540 mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
6541 my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
6542 tprintf("final mv:%d %d\n", mx, my);
6544 fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
6545 fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
6547 fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
6548 fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
6553 assert(IS_8X16(mb_type));
6554 for(list=0; list<2; list++){
6555 if(h->ref_count[list]>0){
6557 if(IS_DIR(mb_type, i, list)){ //FIXME optimize
6558 const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
6559 fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
6561 fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
6565 for(list=0; list<2; list++){
6567 if(IS_DIR(mb_type, i, list)){
6568 pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
6569 mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
6570 my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
6572 tprintf("final mv:%d %d\n", mx, my);
6573 fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
6574 fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
6576 fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
6577 fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
6584 if( IS_INTER( mb_type ) ) {
6585 h->chroma_pred_mode_table[mb_xy] = 0;
6586 write_back_motion( h, mb_type );
6589 if( !IS_INTRA16x16( mb_type ) ) {
6590 cbp = decode_cabac_mb_cbp_luma( h );
6591 cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
6594 h->cbp_table[mb_xy] = h->cbp = cbp;
6596 if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
6597 if( decode_cabac_mb_transform_size( h ) )
6598 mb_type |= MB_TYPE_8x8DCT;
6600 s->current_picture.mb_type[mb_xy]= mb_type;
6602 if( cbp || IS_INTRA16x16( mb_type ) ) {
6603 const uint8_t *scan, *scan8x8, *dc_scan;
6606 if(IS_INTERLACED(mb_type)){
6607 scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
6608 scan= s->qscale ? h->field_scan : h->field_scan_q0;
6609 dc_scan= luma_dc_field_scan;
6611 scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
6612 scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
6613 dc_scan= luma_dc_zigzag_scan;
6616 h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
6617 if( dqp == INT_MIN ){
6618 av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
6622 if(((unsigned)s->qscale) > 51){
6623 if(s->qscale<0) s->qscale+= 52;
6624 else s->qscale-= 52;
6626 h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
6628 if( IS_INTRA16x16( mb_type ) ) {
6630 //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
6631 if( decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16) < 0)
6634 for( i = 0; i < 16; i++ ) {
6635 //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
6636 if( decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 )
6640 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
6644 for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
6645 if( cbp & (1<<i8x8) ) {
6646 if( IS_8x8DCT(mb_type) ) {
6647 if( decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
6648 scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64) < 0 )
6651 for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
6652 const int index = 4*i8x8 + i4x4;
6653 //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
6654 if( decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) < 0 )
6658 uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
6659 nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
6666 for( c = 0; c < 2; c++ ) {
6667 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
6668 if( decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4) < 0)
6675 for( c = 0; c < 2; c++ ) {
6676 for( i = 0; i < 4; i++ ) {
6677 const int index = 16 + 4 * c + i;
6678 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
6679 if( decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp], 15) < 0)
6684 uint8_t * const nnz= &h->non_zero_count_cache[0];
6685 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6686 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6689 uint8_t * const nnz= &h->non_zero_count_cache[0];
6690 fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
6691 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6692 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6693 h->last_qscale_diff = 0;
6696 s->current_picture.qscale_table[mb_xy]= s->qscale;
6697 write_back_non_zero_count(h);
6700 h->ref_count[0] >>= 1;
6701 h->ref_count[1] >>= 1;
/* Deblock one vertical luma edge (16 rows tall).  pix points at the first
 * pixel to the right of the edge, so p0..p3 live at pix[-1..-4] and q0..q3
 * at pix[0..3].  bS holds four boundary strengths (one per 4-row group) and
 * qp the quantizer used to derive the alpha/beta/tc0 thresholds.
 * bS < 4 uses the normal tc-clipped filter (dispatched to dsputil); bS == 4
 * uses the strong intra filter implemented inline below.
 * NOTE(review): some statements between the visible lines are elided in this
 * excerpt (local declarations, the bS<4 branch header). */
6708 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
/* alpha/beta thresholds, index clipped into the valid 0..51 QP range */
6710 const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
6711 const int alpha = alpha_table[index_a];
6712 const int beta = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
/* normal filter: tc0 lookup per group, -1 marks "do not filter" (bS==0) */
6717 tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] : -1;
6718 h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6720 /* 16px edge length, because bS=4 is triggered by being at
6721 * the edge of an intra MB, so all 4 bS are the same */
6722 for( d = 0; d < 16; d++ ) {
6723 const int p0 = pix[-1];
6724 const int p1 = pix[-2];
6725 const int p2 = pix[-3];
6727 const int q0 = pix[0];
6728 const int q1 = pix[1];
6729 const int q2 = pix[2];
/* only filter where the discontinuity looks like a blocking artifact
 * rather than a real image edge */
6731 if( ABS( p0 - q0 ) < alpha &&
6732 ABS( p1 - p0 ) < beta &&
6733 ABS( q1 - q0 ) < beta ) {
/* strong filter: may rewrite up to 3 pixels on each side of the edge */
6735 if(ABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6736 if( ABS( p2 - p0 ) < beta)
6738 const int p3 = pix[-4];
6740 pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6741 pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6742 pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
/* p-side not smooth enough: fall back to the 1-pixel update */
6745 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6747 if( ABS( q2 - q0 ) < beta)
6749 const int q3 = pix[3];
6751 pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6752 pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6753 pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6756 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
/* large p0/q0 step: only p0 and q0 are replaced */
6760 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6761 pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6763 tprintf("filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
/* Deblock one vertical chroma edge.  Same alpha/beta derivation as the luma
 * version; chroma encodes tc as tc0+1 with 0 meaning "skip this group".
 * bS == 4 dispatches to the intra (strong) chroma filter instead. */
6769 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6771 const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
6772 const int alpha = alpha_table[index_a];
6773 const int beta = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
/* normal chroma filter path (bS < 4) */
6778 tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] + 1 : 0;
6779 h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
/* intra/strong chroma filter path (bS == 4), no tc clipping */
6781 h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
/* Deblock the left vertical luma edge of an MBAFF macroblock pair.
 * Unlike filter_mb_edgev this works row by row, because each of the 16 rows
 * may take a different boundary strength (8 bS entries) and one of two QPs
 * (qp[0]/qp[1], one per neighbouring macroblock of the pair). */
6785 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6787 for( i = 0; i < 16; i++, pix += stride) {
/* map the current row onto one of the 8 bS entries */
6793 int bS_index = (i >> 1);
6796 bS_index |= (i & 1);
6799 if( bS[bS_index] == 0 ) {
/* choose the neighbour QP for this row; the row->neighbour mapping
 * differs between field (i>>3) and frame (i&1) decoding */
6803 qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6804 index_a = clip( qp[qp_index] + h->slice_alpha_c0_offset, 0, 51 );
6805 alpha = alpha_table[index_a];
6806 beta = beta_table[clip( qp[qp_index] + h->slice_beta_offset, 0, 51 )];
/* bS < 4: normal tc-clipped filter */
6808 if( bS[bS_index] < 4 ) {
6809 const int tc0 = tc0_table[index_a][bS[bS_index] - 1];
6810 const int p0 = pix[-1];
6811 const int p1 = pix[-2];
6812 const int p2 = pix[-3];
6813 const int q0 = pix[0];
6814 const int q1 = pix[1];
6815 const int q2 = pix[2];
6817 if( ABS( p0 - q0 ) < alpha &&
6818 ABS( p1 - p0 ) < beta &&
6819 ABS( q1 - q0 ) < beta ) {
/* p1/q1 are only adjusted when their side is smooth (|p2-p0|,
 * |q2-q0| below beta); the update is clipped to +-tc0 */
6823 if( ABS( p2 - p0 ) < beta ) {
6824 pix[-2] = p1 + clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6827 if( ABS( q2 - q0 ) < beta ) {
6828 pix[1] = q1 + clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6832 i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6833 pix[-1] = clip_uint8( p0 + i_delta ); /* p0' */
6834 pix[0] = clip_uint8( q0 - i_delta ); /* q0' */
6835 tprintf("filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
/* bS == 4: strong (intra) filter, same computation as filter_mb_edgev */
6838 const int p0 = pix[-1];
6839 const int p1 = pix[-2];
6840 const int p2 = pix[-3];
6842 const int q0 = pix[0];
6843 const int q1 = pix[1];
6844 const int q2 = pix[2];
6846 if( ABS( p0 - q0 ) < alpha &&
6847 ABS( p1 - p0 ) < beta &&
6848 ABS( q1 - q0 ) < beta ) {
6850 if(ABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6851 if( ABS( p2 - p0 ) < beta)
6853 const int p3 = pix[-4];
6855 pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6856 pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6857 pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6860 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6862 if( ABS( q2 - q0 ) < beta)
6864 const int q3 = pix[3];
6866 pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6867 pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6868 pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6871 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6875 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6876 pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6878 tprintf("filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
/* Deblock the left vertical chroma edge of an MBAFF macroblock pair,
 * row by row over the 8 chroma rows.  As with the luma MBAFF filter,
 * each row selects its own bS entry and one of the two neighbour QPs. */
6883 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6885 for( i = 0; i < 8; i++, pix += stride) {
6893 if( bS[bS_index] == 0 ) {
/* field decoding splits the 8 rows 4/4 between neighbours (i>>2),
 * frame decoding alternates them (i&1) */
6897 qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6898 index_a = clip( qp[qp_index] + h->slice_alpha_c0_offset, 0, 51 );
6899 alpha = alpha_table[index_a];
6900 beta = beta_table[clip( qp[qp_index] + h->slice_beta_offset, 0, 51 )];
/* bS < 4: normal chroma filter, tc = tc0 + 1 */
6902 if( bS[bS_index] < 4 ) {
6903 const int tc = tc0_table[index_a][bS[bS_index] - 1] + 1;
6904 const int p0 = pix[-1];
6905 const int p1 = pix[-2];
6906 const int q0 = pix[0];
6907 const int q1 = pix[1];
6909 if( ABS( p0 - q0 ) < alpha &&
6910 ABS( p1 - p0 ) < beta &&
6911 ABS( q1 - q0 ) < beta ) {
6912 const int i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6914 pix[-1] = clip_uint8( p0 + i_delta ); /* p0' */
6915 pix[0] = clip_uint8( q0 - i_delta ); /* q0' */
6916 tprintf("filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
/* bS == 4: strong chroma filter, only p0/q0 are rewritten */
6919 const int p0 = pix[-1];
6920 const int p1 = pix[-2];
6921 const int q0 = pix[0];
6922 const int q1 = pix[1];
6924 if( ABS( p0 - q0 ) < alpha &&
6925 ABS( p1 - p0 ) < beta &&
6926 ABS( q1 - q0 ) < beta ) {
6928 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
6929 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
6930 tprintf("filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
/* Deblock one horizontal luma edge (16 columns wide).  pix points at the
 * first row below the edge, so p-samples are above (negative multiples of
 * stride) and q-samples below.  Mirrors filter_mb_edgev: bS < 4 goes to the
 * dsputil routine, bS == 4 runs the strong intra filter inline. */
6936 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6938 const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
6939 const int alpha = alpha_table[index_a];
6940 const int beta = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
6941 const int pix_next = stride;
/* normal filter path (bS < 4); -1 disables filtering for bS==0 groups */
6946 tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] : -1;
6947 h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6949 /* 16px edge length, see filter_mb_edgev */
6950 for( d = 0; d < 16; d++ ) {
6951 const int p0 = pix[-1*pix_next];
6952 const int p1 = pix[-2*pix_next];
6953 const int p2 = pix[-3*pix_next];
6954 const int q0 = pix[0];
6955 const int q1 = pix[1*pix_next];
6956 const int q2 = pix[2*pix_next];
6958 if( ABS( p0 - q0 ) < alpha &&
6959 ABS( p1 - p0 ) < beta &&
6960 ABS( q1 - q0 ) < beta ) {
6962 const int p3 = pix[-4*pix_next];
6963 const int q3 = pix[ 3*pix_next];
/* strong filter: up to 3 pixels per side when the step is small */
6965 if(ABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6966 if( ABS( p2 - p0 ) < beta) {
6968 pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6969 pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6970 pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6973 pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6975 if( ABS( q2 - q0 ) < beta) {
6977 pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6978 pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6979 pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6982 pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
/* large step: only p0/q0 are replaced */
6986 pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6987 pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6989 tprintf("filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
/* Deblock one horizontal chroma edge.  Chroma tc is tc0+1 with 0 meaning
 * "skip"; bS == 4 dispatches to the intra (strong) chroma filter. */
6996 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6998 const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
6999 const int alpha = alpha_table[index_a];
7000 const int beta = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
/* normal path (bS < 4) */
7005 tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] + 1 : 0;
7006 h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
/* intra/strong path (bS == 4) */
7008 h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
/* Fast deblocking path for one macroblock: computes boundary strengths in
 * bulk (via dsp.h264_loop_filter_strength) instead of per edge.  Falls back
 * to the generic filter_mb() for picture-border macroblocks or when no
 * optimized strength routine is available.  Not used with MBAFF (asserted).
 * NOTE(review): the tail of this function (the non-intra FILTER() invocations
 * and the 8x8DCT branch) is elided in this excerpt. */
7012 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
7013 MpegEncContext * const s = &h->s;
7015 int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
7017 if(mb_x==0 || mb_y==0 || !s->dsp.h264_loop_filter_strength) {
7018 filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
7021 assert(!FRAME_MBAFF);
7023 mb_xy = mb_x + mb_y*s->mb_stride;
7024 mb_type = s->current_picture.mb_type[mb_xy];
/* luma QPs of this MB and its left/top neighbours, plus the chroma QPs */
7025 qp = s->current_picture.qscale_table[mb_xy];
7026 qp0 = s->current_picture.qscale_table[mb_xy-1];
7027 qp1 = s->current_picture.qscale_table[h->top_mb_xy];
7028 qpc = get_chroma_qp( h->pps.chroma_qp_index_offset, qp );
7029 qpc0 = get_chroma_qp( h->pps.chroma_qp_index_offset, qp0 );
7030 qpc1 = get_chroma_qp( h->pps.chroma_qp_index_offset, qp1 );
/* edge QPs are the rounded averages of the two adjacent MBs' QPs */
7031 qp0 = (qp + qp0 + 1) >> 1;
7032 qp1 = (qp + qp1 + 1) >> 1;
7033 qpc0 = (qpc + qpc0 + 1) >> 1;
7034 qpc1 = (qpc + qpc1 + 1) >> 1;
/* below this QP the filter provably changes nothing: skip the whole MB */
7035 qp_thresh = 15 - h->slice_alpha_c0_offset;
7036 if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
7037 qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
/* intra MBs use fixed strengths: 4 on MB borders, 3 on internal edges */
7040 if( IS_INTRA(mb_type) ) {
7041 int16_t bS4[4] = {4,4,4,4};
7042 int16_t bS3[4] = {3,3,3,3};
/* with 8x8 transform only every second internal edge is filtered */
7043 if( IS_8x8DCT(mb_type) ) {
7044 filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
7045 filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
7046 filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bS4, qp1 );
7047 filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
7049 filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
7050 filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
7051 filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
7052 filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
7053 filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bS4, qp1 );
7054 filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
7055 filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
7056 filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
/* chroma: only the MB border and the central edge exist (8x8 plane) */
7058 filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
7059 filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
7060 filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
7061 filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
7062 filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bS4, qpc1 );
7063 filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
7064 filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bS4, qpc1 );
7065 filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
/* inter MB: compute all bS values at once, 8-byte aligned for the
 * uint64_t aliasing tricks below */
7068 DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
7069 uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
7071 if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
/* all edges get bS=2: set four int16 strengths per 64-bit store */
7073 bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
/* mask_edge*: how often the mv-based bS must be recomputed, derived
 * from the partition shapes of this MB and the left neighbour */
7075 int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
7076 (mb_type & MB_TYPE_16x8) ? 1 : 0;
7077 int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
7078 && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
7080 int step = IS_8x8DCT(mb_type) ? 2 : 1;
7081 edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
7082 s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
7083 (h->slice_type == B_TYPE), edges, step, mask_edge0, mask_edge1 );
/* intra neighbours force bS=4 on the shared border edge */
7085 if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
7086 bSv[0][0] = 0x0004000400040004ULL;
7087 if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
7088 bSv[1][0] = 0x0004000400040004ULL;
/* FILTER(hv,dir,edge): run the luma + chroma filters for one edge,
 * using the border QP (qp0/qp1) for edge 0 and the MB QP otherwise */
7090 #define FILTER(hv,dir,edge)\
7091 if(bSv[dir][edge]) {\
7092 filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
7094 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
7095 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
7101 } else if( IS_8x8DCT(mb_type) ) {
/* Generic (slow-path) deblocking of one macroblock: computes the boundary
 * strength bS for every 4-pixel edge segment from intra/nnz/mv/ref data and
 * applies the vertical and horizontal edge filters to luma and chroma.
 * Handles MBAFF and field/frame mixing cases that filter_mb_fast cannot. */
7120 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
7121 MpegEncContext * const s = &h->s;
7122 const int mb_xy= mb_x + mb_y*s->mb_stride;
7123 const int mb_type = s->current_picture.mb_type[mb_xy];
/* interlaced MBs compare vertical mv components with a tighter limit */
7124 const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
7125 int first_vertical_edge_done = 0;
7127 /* FIXME: A given frame may occupy more than one position in
7128 * the reference list. So ref2frm should be populated with
7129 * frame numbers, not indices. */
7130 static const int ref2frm[34] = {-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
7131 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
7133 //for sufficiently low qp, filtering wouldn't do anything
7134 //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
7136 int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX(0, h->pps.chroma_qp_index_offset);
7137 int qp = s->current_picture.qscale_table[mb_xy];
7139 && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
7140 && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
7146 // left mb is in picture
7147 && h->slice_table[mb_xy-1] != 255
7148 // and current and left pair do not have the same interlaced type
7149 && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
7150 // and left mb is in the same slice if deblocking_filter == 2
7151 && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
7152 /* First vertical edge is different in MBAFF frames
7153 * There are 8 different bS to compute and 2 different Qp
7155 const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
7156 const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
7160 int mb_qp, mbn0_qp, mbn1_qp;
7162 first_vertical_edge_done = 1;
7164 if( IS_INTRA(mb_type) )
7165 bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
7167 for( i = 0; i < 8; i++ ) {
/* pick which of the two left MBs this row borders on */
7168 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
7170 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
7172 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
7173 /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
7174 h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
/* two edge QPs: this MB averaged against each left neighbour */
7181 mb_qp = s->current_picture.qscale_table[mb_xy];
7182 mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
7183 mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
7184 qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
7185 chroma_qp[0] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, mb_qp ) +
7186 get_chroma_qp( h->pps.chroma_qp_index_offset, mbn0_qp ) + 1 ) >> 1;
7187 qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
7188 chroma_qp[1] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, mb_qp ) +
7189 get_chroma_qp( h->pps.chroma_qp_index_offset, mbn1_qp ) + 1 ) >> 1;
7192 tprintf("filter mb:%d/%d MBAFF, QPy:%d/%d, QPc:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], chroma_qp[0], chroma_qp[1], linesize, uvlinesize);
7193 { int i; for (i = 0; i < 8; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
7194 filter_mb_mbaff_edgev ( h, &img_y [0], linesize, bS, qp );
7195 filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, chroma_qp );
7196 filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, chroma_qp );
7198 /* dir : 0 -> vertical edge, 1 -> horizontal edge */
7199 for( dir = 0; dir < 2; dir++ )
/* mbm: the neighbour across the MB border in this direction */
7202 const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
7203 const int mbm_type = s->current_picture.mb_type[mbm_xy];
/* slice_table==255 means the neighbour is outside the picture, so
 * skip edge 0 (the MB border) and start at the first internal edge */
7204 int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
7206 const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
7207 == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
7208 // how often to recheck mv-based bS when iterating between edges
7209 const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
7210 (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
7211 // how often to recheck mv-based bS when iterating along each edge
7212 const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
7214 if (first_vertical_edge_done) {
7216 first_vertical_edge_done = 0;
/* deblocking_filter==2: do not filter across slice boundaries */
7219 if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
7222 if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
7223 && !IS_INTERLACED(mb_type)
7224 && IS_INTERLACED(mbm_type)
7226 // This is a special case in the norm where the filtering must
7227 // be done twice (one each of the field) even if we are in a
7228 // frame macroblock.
7230 static const int nnz_idx[4] = {4,5,6,3};
7231 unsigned int tmp_linesize = 2 * linesize;
7232 unsigned int tmp_uvlinesize = 2 * uvlinesize;
7233 int mbn_xy = mb_xy - 2 * s->mb_stride;
/* filter against each field MB of the pair above */
7238 for(j=0; j<2; j++, mbn_xy += s->mb_stride){
7239 if( IS_INTRA(mb_type) ||
7240 IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
7241 bS[0] = bS[1] = bS[2] = bS[3] = 3;
7243 const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
7244 for( i = 0; i < 4; i++ ) {
7245 if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
7246 mbn_nnz[nnz_idx[i]] != 0 )
7252 // Do not use s->qscale as luma quantizer because it has not the same
7253 // value in IPCM macroblocks.
7254 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
7255 tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
7256 { int i; for (i = 0; i < 4; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
7257 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
7258 chroma_qp = ( h->chroma_qp +
7259 get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
7260 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp );
7261 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp );
/* regular edge loop: edge 0 is the MB border, 1..3 are internal */
7268 for( edge = start; edge < edges; edge++ ) {
7269 /* mbn_xy: neighbor macroblock */
7270 const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
7271 const int mbn_type = s->current_picture.mb_type[mbn_xy];
/* 8x8 transform: the odd internal edges do not exist */
7275 if( (edge&1) && IS_8x8DCT(mb_type) )
/* bS derivation, strongest condition first: intra => 4 on the MB
 * border (3 internally), then coded coefficients => 2, then
 * ref/mv mismatch => 1, else 0 */
7278 if( IS_INTRA(mb_type) ||
7279 IS_INTRA(mbn_type) ) {
7282 if ( (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
7283 || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
7292 bS[0] = bS[1] = bS[2] = bS[3] = value;
7297 if( edge & mask_edge ) {
7298 bS[0] = bS[1] = bS[2] = bS[3] = 0;
7301 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
7302 bS[0] = bS[1] = bS[2] = bS[3] = 1;
7305 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
7306 int b_idx= 8 + 4 + edge * (dir ? 8:1);
7307 int bn_idx= b_idx - (dir ? 8:1);
7309 for( l = 0; !v && l < 1 + (h->slice_type == B_TYPE); l++ ) {
7310 v |= ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
7311 ABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
7312 ABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
7314 bS[0] = bS[1] = bS[2] = bS[3] = v;
/* per-4x4 bS: nnz on either side => 2, else mv/ref compare */
7320 for( i = 0; i < 4; i++ ) {
7321 int x = dir == 0 ? edge : i;
7322 int y = dir == 0 ? i : edge;
7323 int b_idx= 8 + 4 + x + 8*y;
7324 int bn_idx= b_idx - (dir ? 8:1);
7326 if( h->non_zero_count_cache[b_idx] != 0 ||
7327 h->non_zero_count_cache[bn_idx] != 0 ) {
7333 for( l = 0; l < 1 + (h->slice_type == B_TYPE); l++ ) {
7334 if( ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
7335 ABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
7336 ABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
7344 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
7349 // Do not use s->qscale as luma quantizer because it has not the same
7350 // value in IPCM macroblocks.
7351 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
7352 //tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
7353 tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
7354 { int i; for (i = 0; i < 4; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
/* dir==0: vertical edges; chroma only has edges at even positions */
7356 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
7357 if( (edge&1) == 0 ) {
7358 int chroma_qp = ( h->chroma_qp +
7359 get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
7360 filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS, chroma_qp );
7361 filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS, chroma_qp );
/* dir==1: horizontal edges */
7364 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
7365 if( (edge&1) == 0 ) {
7366 int chroma_qp = ( h->chroma_qp +
7367 get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
7368 filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
7369 filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
7376 static int decode_slice(H264Context *h){
7377 MpegEncContext * const s = &h->s;
7378 const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
7382 if( h->pps.cabac ) {
7386 align_get_bits( &s->gb );
7389 ff_init_cabac_states( &h->cabac, ff_h264_lps_range, ff_h264_mps_state, ff_h264_lps_state, 64 );
7390 ff_init_cabac_decoder( &h->cabac,
7391 s->gb.buffer + get_bits_count(&s->gb)/8,
7392 ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
7393 /* calculate pre-state */
7394 for( i= 0; i < 460; i++ ) {
7396 if( h->slice_type == I_TYPE )
7397 pre = clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
7399 pre = clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
7402 h->cabac_state[i] = 2 * ( 63 - pre ) + 2;
7404 h->cabac_state[i] = 2 * ( pre - 64 ) + 3;
7408 int ret = decode_mb_cabac(h);
7411 if(ret>=0) hl_decode_mb(h);
7413 if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
7416 if(ret>=0) ret = decode_mb_cabac(h);
7418 if(ret>=0) hl_decode_mb(h);
7421 eos = get_cabac_terminate( &h->cabac );
7423 if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
7424 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%d)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
7425 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7429 if( ++s->mb_x >= s->mb_width ) {
7431 ff_draw_horiz_band(s, 16*s->mb_y, 16);
7438 if( eos || s->mb_y >= s->mb_height ) {
7439 tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7440 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7447 int ret = decode_mb_cavlc(h);
7449 if(ret>=0) hl_decode_mb(h);
7451 if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
7453 ret = decode_mb_cavlc(h);
7455 if(ret>=0) hl_decode_mb(h);
7460 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
7461 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7466 if(++s->mb_x >= s->mb_width){
7468 ff_draw_horiz_band(s, 16*s->mb_y, 16);
7473 if(s->mb_y >= s->mb_height){
7474 tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7476 if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
7477 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7481 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7488 if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
7489 tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7490 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
7491 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7495 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7504 for(;s->mb_y < s->mb_height; s->mb_y++){
7505 for(;s->mb_x < s->mb_width; s->mb_x++){
7506 int ret= decode_mb(h);
7511 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
7512 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7517 if(++s->mb_x >= s->mb_width){
7519 if(++s->mb_y >= s->mb_height){
7520 if(get_bits_count(s->gb) == s->gb.size_in_bits){
7521 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7525 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7532 if(get_bits_count(s->gb) >= s->gb.size_in_bits){
7533 if(get_bits_count(s->gb) == s->gb.size_in_bits){
7534 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7538 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7545 ff_draw_horiz_band(s, 16*s->mb_y, 16);
7548 return -1; //not reached
7551 static int decode_unregistered_user_data(H264Context *h, int size){
7552 MpegEncContext * const s = &h->s;
7553 uint8_t user_data[16+256];
7559 for(i=0; i<sizeof(user_data)-1 && i<size; i++){
7560 user_data[i]= get_bits(&s->gb, 8);
7564 e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
7565 if(e==1 && build>=0)
7566 h->x264_build= build;
7568 if(s->avctx->debug & FF_DEBUG_BUGS)
7569 av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
7572 skip_bits(&s->gb, 8);
7577 static int decode_sei(H264Context *h){
7578 MpegEncContext * const s = &h->s;
7580 while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
7585 type+= show_bits(&s->gb, 8);
7586 }while(get_bits(&s->gb, 8) == 255);
7590 size+= show_bits(&s->gb, 8);
7591 }while(get_bits(&s->gb, 8) == 255);
7595 if(decode_unregistered_user_data(h, size) < 0)
7599 skip_bits(&s->gb, 8*size);
7602 //FIXME check bits here
7603 align_get_bits(&s->gb);
7609 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
7610 MpegEncContext * const s = &h->s;
7612 cpb_count = get_ue_golomb(&s->gb) + 1;
7613 get_bits(&s->gb, 4); /* bit_rate_scale */
7614 get_bits(&s->gb, 4); /* cpb_size_scale */
7615 for(i=0; i<cpb_count; i++){
7616 get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
7617 get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
7618 get_bits1(&s->gb); /* cbr_flag */
7620 get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
7621 get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
7622 get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
7623 get_bits(&s->gb, 5); /* time_offset_length */
7626 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
7627 MpegEncContext * const s = &h->s;
7628 int aspect_ratio_info_present_flag, aspect_ratio_idc;
7629 int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
7631 aspect_ratio_info_present_flag= get_bits1(&s->gb);
7633 if( aspect_ratio_info_present_flag ) {
7634 aspect_ratio_idc= get_bits(&s->gb, 8);
7635 if( aspect_ratio_idc == EXTENDED_SAR ) {
7636 sps->sar.num= get_bits(&s->gb, 16);
7637 sps->sar.den= get_bits(&s->gb, 16);
7638 }else if(aspect_ratio_idc < 14){
7639 sps->sar= pixel_aspect[aspect_ratio_idc];
7641 av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
7648 // s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
7650 if(get_bits1(&s->gb)){ /* overscan_info_present_flag */
7651 get_bits1(&s->gb); /* overscan_appropriate_flag */
7654 if(get_bits1(&s->gb)){ /* video_signal_type_present_flag */
7655 get_bits(&s->gb, 3); /* video_format */
7656 get_bits1(&s->gb); /* video_full_range_flag */
7657 if(get_bits1(&s->gb)){ /* colour_description_present_flag */
7658 get_bits(&s->gb, 8); /* colour_primaries */
7659 get_bits(&s->gb, 8); /* transfer_characteristics */
7660 get_bits(&s->gb, 8); /* matrix_coefficients */
7664 if(get_bits1(&s->gb)){ /* chroma_location_info_present_flag */
7665 get_ue_golomb(&s->gb); /* chroma_sample_location_type_top_field */
7666 get_ue_golomb(&s->gb); /* chroma_sample_location_type_bottom_field */
7669 sps->timing_info_present_flag = get_bits1(&s->gb);
7670 if(sps->timing_info_present_flag){
7671 sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7672 sps->time_scale = get_bits_long(&s->gb, 32);
7673 sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7676 nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7677 if(nal_hrd_parameters_present_flag)
7678 decode_hrd_parameters(h, sps);
7679 vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7680 if(vcl_hrd_parameters_present_flag)
7681 decode_hrd_parameters(h, sps);
7682 if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
7683 get_bits1(&s->gb); /* low_delay_hrd_flag */
7684 get_bits1(&s->gb); /* pic_struct_present_flag */
7686 sps->bitstream_restriction_flag = get_bits1(&s->gb);
7687 if(sps->bitstream_restriction_flag){
7688 get_bits1(&s->gb); /* motion_vectors_over_pic_boundaries_flag */
7689 get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7690 get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7691 get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7692 get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7693 sps->num_reorder_frames = get_ue_golomb(&s->gb);
7694 get_ue_golomb(&s->gb); /* max_dec_frame_buffering */
7700 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7701 const uint8_t *jvt_list, const uint8_t *fallback_list){
7702 MpegEncContext * const s = &h->s;
7703 int i, last = 8, next = 8;
7704 const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
7705 if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7706 memcpy(factors, fallback_list, size*sizeof(uint8_t));
7708 for(i=0;i<size;i++){
7710 next = (last + get_se_golomb(&s->gb)) & 0xff;
7711 if(!i && !next){ /* matrix not written, we use the preset one */
7712 memcpy(factors, jvt_list, size*sizeof(uint8_t));
7715 last = factors[scan[i]] = next ? next : last;
7719 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7720 uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7721 MpegEncContext * const s = &h->s;
7722 int fallback_sps = !is_sps && sps->scaling_matrix_present;
7723 const uint8_t *fallback[4] = {
7724 fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7725 fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7726 fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7727 fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7729 if(get_bits1(&s->gb)){
7730 sps->scaling_matrix_present |= is_sps;
7731 decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7732 decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7733 decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7734 decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7735 decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7736 decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7737 if(is_sps || pps->transform_8x8_mode){
7738 decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]); // Intra, Y
7739 decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]); // Inter, Y
7741 } else if(fallback_sps) {
7742 memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
7743 memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
7747 static inline int decode_seq_parameter_set(H264Context *h){
7748 MpegEncContext * const s = &h->s;
7749 int profile_idc, level_idc;
7753 profile_idc= get_bits(&s->gb, 8);
7754 get_bits1(&s->gb); //constraint_set0_flag
7755 get_bits1(&s->gb); //constraint_set1_flag
7756 get_bits1(&s->gb); //constraint_set2_flag
7757 get_bits1(&s->gb); //constraint_set3_flag
7758 get_bits(&s->gb, 4); // reserved
7759 level_idc= get_bits(&s->gb, 8);
7760 sps_id= get_ue_golomb(&s->gb);
7762 sps= &h->sps_buffer[ sps_id ];
7763 sps->profile_idc= profile_idc;
7764 sps->level_idc= level_idc;
7766 if(sps->profile_idc >= 100){ //high profile
7767 if(get_ue_golomb(&s->gb) == 3) //chroma_format_idc
7768 get_bits1(&s->gb); //residual_color_transform_flag
7769 get_ue_golomb(&s->gb); //bit_depth_luma_minus8
7770 get_ue_golomb(&s->gb); //bit_depth_chroma_minus8
7771 sps->transform_bypass = get_bits1(&s->gb);
7772 decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7774 sps->scaling_matrix_present = 0;
7776 sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7777 sps->poc_type= get_ue_golomb(&s->gb);
7779 if(sps->poc_type == 0){ //FIXME #define
7780 sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7781 } else if(sps->poc_type == 1){//FIXME #define
7782 sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7783 sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7784 sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7785 sps->poc_cycle_length= get_ue_golomb(&s->gb);
7787 for(i=0; i<sps->poc_cycle_length; i++)
7788 sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7790 if(sps->poc_type > 2){
7791 av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7795 sps->ref_frame_count= get_ue_golomb(&s->gb);
7796 if(sps->ref_frame_count > MAX_PICTURE_COUNT-2){
7797 av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7799 sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7800 sps->mb_width= get_ue_golomb(&s->gb) + 1;
7801 sps->mb_height= get_ue_golomb(&s->gb) + 1;
7802 if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7803 avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height))
7806 sps->frame_mbs_only_flag= get_bits1(&s->gb);
7807 if(!sps->frame_mbs_only_flag)
7808 sps->mb_aff= get_bits1(&s->gb);
7812 sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7814 #ifndef ALLOW_INTERLACE
7816 av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7818 if(!sps->direct_8x8_inference_flag && sps->mb_aff)
7819 av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
7821 sps->crop= get_bits1(&s->gb);
7823 sps->crop_left = get_ue_golomb(&s->gb);
7824 sps->crop_right = get_ue_golomb(&s->gb);
7825 sps->crop_top = get_ue_golomb(&s->gb);
7826 sps->crop_bottom= get_ue_golomb(&s->gb);
7827 if(sps->crop_left || sps->crop_top){
7828 av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7834 sps->crop_bottom= 0;
7837 sps->vui_parameters_present_flag= get_bits1(&s->gb);
7838 if( sps->vui_parameters_present_flag )
7839 decode_vui_parameters(h, sps);
7841 if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7842 av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%d profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
7843 sps_id, sps->profile_idc, sps->level_idc,
7845 sps->ref_frame_count,
7846 sps->mb_width, sps->mb_height,
7847 sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7848 sps->direct_8x8_inference_flag ? "8B8" : "",
7849 sps->crop_left, sps->crop_right,
7850 sps->crop_top, sps->crop_bottom,
7851 sps->vui_parameters_present_flag ? "VUI" : ""
7857 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7858 MpegEncContext * const s = &h->s;
7859 int pps_id= get_ue_golomb(&s->gb);
7860 PPS *pps= &h->pps_buffer[pps_id];
7862 pps->sps_id= get_ue_golomb(&s->gb);
7863 pps->cabac= get_bits1(&s->gb);
7864 pps->pic_order_present= get_bits1(&s->gb);
7865 pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7866 if(pps->slice_group_count > 1 ){
7867 pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7868 av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7869 switch(pps->mb_slice_group_map_type){
7872 | for( i = 0; i <= num_slice_groups_minus1; i++ ) | | |
7873 | run_length[ i ] |1 |ue(v) |
7878 | for( i = 0; i < num_slice_groups_minus1; i++ ) | | |
7880 | top_left_mb[ i ] |1 |ue(v) |
7881 | bottom_right_mb[ i ] |1 |ue(v) |
7889 | slice_group_change_direction_flag |1 |u(1) |
7890 | slice_group_change_rate_minus1 |1 |ue(v) |
7895 | slice_group_id_cnt_minus1 |1 |ue(v) |
7896 | for( i = 0; i <= slice_group_id_cnt_minus1; i++ | | |
7898 | slice_group_id[ i ] |1 |u(v) |
7903 pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7904 pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7905 if(pps->ref_count[0] > 32 || pps->ref_count[1] > 32){
7906 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7910 pps->weighted_pred= get_bits1(&s->gb);
7911 pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7912 pps->init_qp= get_se_golomb(&s->gb) + 26;
7913 pps->init_qs= get_se_golomb(&s->gb) + 26;
7914 pps->chroma_qp_index_offset= get_se_golomb(&s->gb);
7915 pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7916 pps->constrained_intra_pred= get_bits1(&s->gb);
7917 pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7919 pps->transform_8x8_mode= 0;
7920 h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7921 memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
7922 memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
7924 if(get_bits_count(&s->gb) < bit_length){
7925 pps->transform_8x8_mode= get_bits1(&s->gb);
7926 decode_scaling_matrices(h, &h->sps_buffer[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7927 get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7930 if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7931 av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%d sps:%d %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d %s %s %s %s\n",
7932 pps_id, pps->sps_id,
7933 pps->cabac ? "CABAC" : "CAVLC",
7934 pps->slice_group_count,
7935 pps->ref_count[0], pps->ref_count[1],
7936 pps->weighted_pred ? "weighted" : "",
7937 pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset,
7938 pps->deblocking_filter_parameters_present ? "LPAR" : "",
7939 pps->constrained_intra_pred ? "CONSTR" : "",
7940 pps->redundant_pic_cnt_present ? "REDU" : "",
7941 pps->transform_8x8_mode ? "8x8DCT" : ""
7949 * finds the end of the current frame in the bitstream.
7950 * @return the position of the first byte of the next frame, or -1
7952 static int find_frame_end(H264Context *h, const uint8_t *buf, int buf_size){
7955 ParseContext *pc = &(h->s.parse_context);
7956 //printf("first %02X%02X%02X%02X\n", buf[0], buf[1],buf[2],buf[3]);
7957 // mb_addr= pc->mb_addr - 1;
7959 for(i=0; i<=buf_size; i++){
7960 if((state&0xFFFFFF1F) == 0x101 || (state&0xFFFFFF1F) == 0x102 || (state&0xFFFFFF1F) == 0x105){
7961 tprintf("find_frame_end new startcode = %08x, frame_start_found = %d, pos = %d\n", state, pc->frame_start_found, i);
7962 if(pc->frame_start_found){
7963 // If there isn't one more byte in the buffer
7964 // the test on first_mb_in_slice cannot be done yet
7965 // do it at next call.
7966 if (i >= buf_size) break;
7967 if (buf[i] & 0x80) {
7968 // first_mb_in_slice is 0, probably the first nal of a new
7970 tprintf("find_frame_end frame_end_found, state = %08x, pos = %d\n", state, i);
7972 pc->frame_start_found= 0;
7976 pc->frame_start_found = 1;
7978 if((state&0xFFFFFF1F) == 0x107 || (state&0xFFFFFF1F) == 0x108 || (state&0xFFFFFF1F) == 0x109){
7979 if(pc->frame_start_found){
7981 pc->frame_start_found= 0;
7986 state= (state<<8) | buf[i];
7990 return END_NOT_FOUND;
7993 #ifdef CONFIG_H264_PARSER
7994 static int h264_parse(AVCodecParserContext *s,
7995 AVCodecContext *avctx,
7996 uint8_t **poutbuf, int *poutbuf_size,
7997 const uint8_t *buf, int buf_size)
7999 H264Context *h = s->priv_data;
8000 ParseContext *pc = &h->s.parse_context;
8003 next= find_frame_end(h, buf, buf_size);
8005 if (ff_combine_frame(pc, next, (uint8_t **)&buf, &buf_size) < 0) {
8011 *poutbuf = (uint8_t *)buf;
8012 *poutbuf_size = buf_size;
8016 static int h264_split(AVCodecContext *avctx,
8017 const uint8_t *buf, int buf_size)
8020 uint32_t state = -1;
8023 for(i=0; i<=buf_size; i++){
8024 if((state&0xFFFFFF1F) == 0x107)
8026 /* if((state&0xFFFFFF1F) == 0x101 || (state&0xFFFFFF1F) == 0x102 || (state&0xFFFFFF1F) == 0x105){
8028 if((state&0xFFFFFF00) == 0x100 && (state&0xFFFFFF1F) != 0x107 && (state&0xFFFFFF1F) != 0x108 && (state&0xFFFFFF1F) != 0x109){
8030 while(i>4 && buf[i-5]==0) i--;
8035 state= (state<<8) | buf[i];
8039 #endif /* CONFIG_H264_PARSER */
8041 static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
8042 MpegEncContext * const s = &h->s;
8043 AVCodecContext * const avctx= s->avctx;
8047 for(i=0; i<50; i++){
8048 av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
8052 s->current_picture_ptr= NULL;
8061 if(buf_index >= buf_size) break;
8063 for(i = 0; i < h->nal_length_size; i++)
8064 nalsize = (nalsize << 8) | buf[buf_index++];
8070 av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
8075 // start code prefix search
8076 for(; buf_index + 3 < buf_size; buf_index++){
8077 // this should allways succeed in the first iteration
8078 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
8082 if(buf_index+3 >= buf_size) break;
8087 ptr= decode_nal(h, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
8088 while(ptr[dst_length - 1] == 0 && dst_length > 1)
8090 bit_length= 8*dst_length - decode_rbsp_trailing(ptr + dst_length - 1);
8092 if(s->avctx->debug&FF_DEBUG_STARTCODE){
8093 av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", h->nal_unit_type, buf_index, buf_size, dst_length);
8096 if (h->is_avc && (nalsize != consumed))
8097 av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
8099 buf_index += consumed;
8101 if( (s->hurry_up == 1 && h->nal_ref_idc == 0) //FIXME dont discard SEI id
8102 ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
8105 switch(h->nal_unit_type){
8107 idr(h); //FIXME ensure we don't loose some frames if there is reordering
8109 init_get_bits(&s->gb, ptr, bit_length);
8111 h->inter_gb_ptr= &s->gb;
8112 s->data_partitioning = 0;
8114 if(decode_slice_header(h) < 0){
8115 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
8118 s->current_picture_ptr->key_frame= (h->nal_unit_type == NAL_IDR_SLICE);
8119 if(h->redundant_pic_count==0 && s->hurry_up < 5
8120 && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc)
8121 && (avctx->skip_frame < AVDISCARD_BIDIR || h->slice_type!=B_TYPE)
8122 && (avctx->skip_frame < AVDISCARD_NONKEY || h->slice_type==I_TYPE)
8123 && avctx->skip_frame < AVDISCARD_ALL)
8127 init_get_bits(&s->gb, ptr, bit_length);
8129 h->inter_gb_ptr= NULL;
8130 s->data_partitioning = 1;
8132 if(decode_slice_header(h) < 0){
8133 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
8137 init_get_bits(&h->intra_gb, ptr, bit_length);
8138 h->intra_gb_ptr= &h->intra_gb;
8141 init_get_bits(&h->inter_gb, ptr, bit_length);
8142 h->inter_gb_ptr= &h->inter_gb;
8144 if(h->redundant_pic_count==0 && h->intra_gb_ptr && s->data_partitioning
8146 && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc)
8147 && (avctx->skip_frame < AVDISCARD_BIDIR || h->slice_type!=B_TYPE)
8148 && (avctx->skip_frame < AVDISCARD_NONKEY || h->slice_type==I_TYPE)
8149 && avctx->skip_frame < AVDISCARD_ALL)
8153 init_get_bits(&s->gb, ptr, bit_length);
8157 init_get_bits(&s->gb, ptr, bit_length);
8158 decode_seq_parameter_set(h);
8160 if(s->flags& CODEC_FLAG_LOW_DELAY)
8163 if(avctx->has_b_frames < 2)
8164 avctx->has_b_frames= !s->low_delay;
8167 init_get_bits(&s->gb, ptr, bit_length);
8169 decode_picture_parameter_set(h, bit_length);
8173 case NAL_END_SEQUENCE:
8174 case NAL_END_STREAM:
8175 case NAL_FILLER_DATA:
8177 case NAL_AUXILIARY_SLICE:
8180 av_log(avctx, AV_LOG_ERROR, "Unknown NAL code: %d\n", h->nal_unit_type);
8184 if(!s->current_picture_ptr) return buf_index; //no frame
8186 s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
8187 s->current_picture_ptr->pict_type= s->pict_type;
8189 h->prev_frame_num_offset= h->frame_num_offset;
8190 h->prev_frame_num= h->frame_num;
8191 if(s->current_picture_ptr->reference){
8192 h->prev_poc_msb= h->poc_msb;
8193 h->prev_poc_lsb= h->poc_lsb;
8195 if(s->current_picture_ptr->reference)
8196 execute_ref_pic_marking(h, h->mmco, h->mmco_index);
8206 * returns the number of bytes consumed for building the current frame
8208 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
8209 if(s->flags&CODEC_FLAG_TRUNCATED){
8210 pos -= s->parse_context.last_index;
8211 if(pos<0) pos=0; // FIXME remove (unneeded?)
8215 if(pos==0) pos=1; //avoid infinite loops (i doubt thats needed but ...)
8216 if(pos+10>buf_size) pos=buf_size; // oops ;)
8222 static int decode_frame(AVCodecContext *avctx,
8223 void *data, int *data_size,
8224 uint8_t *buf, int buf_size)
8226 H264Context *h = avctx->priv_data;
8227 MpegEncContext *s = &h->s;
8228 AVFrame *pict = data;
8231 s->flags= avctx->flags;
8232 s->flags2= avctx->flags2;
8234 /* no supplementary picture */
8235 if (buf_size == 0) {
8239 if(s->flags&CODEC_FLAG_TRUNCATED){
8240 int next= find_frame_end(h, buf, buf_size);
8242 if( ff_combine_frame(&s->parse_context, next, &buf, &buf_size) < 0 )
8244 //printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
8247 if(h->is_avc && !h->got_avcC) {
8248 int i, cnt, nalsize;
8249 unsigned char *p = avctx->extradata;
8250 if(avctx->extradata_size < 7) {
8251 av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
8255 av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
8258 /* sps and pps in the avcC always have length coded with 2 bytes,
8259 so put a fake nal_length_size = 2 while parsing them */
8260 h->nal_length_size = 2;
8261 // Decode sps from avcC
8262 cnt = *(p+5) & 0x1f; // Number of sps
8264 for (i = 0; i < cnt; i++) {
8265 nalsize = BE_16(p) + 2;
8266 if(decode_nal_units(h, p, nalsize) < 0) {
8267 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
8272 // Decode pps from avcC
8273 cnt = *(p++); // Number of pps
8274 for (i = 0; i < cnt; i++) {
8275 nalsize = BE_16(p) + 2;
8276 if(decode_nal_units(h, p, nalsize) != nalsize) {
8277 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
8282 // Now store right nal length size, that will be use to parse all other nals
8283 h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
8284 // Do not reparse avcC
8288 if(!h->is_avc && s->avctx->extradata_size && s->picture_number==0){
8289 if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
8293 buf_index=decode_nal_units(h, buf, buf_size);
8297 //FIXME do something with unavailable reference frames
8299 // if(ret==FRAME_SKIPPED) return get_consumed_bytes(s, buf_index, buf_size);
8300 if(!s->current_picture_ptr){
8301 av_log(h->s.avctx, AV_LOG_DEBUG, "error, NO frame\n");
8306 Picture *out = s->current_picture_ptr;
8307 #if 0 //decode order
8308 *data_size = sizeof(AVFrame);
8310 /* Sort B-frames into display order */
8311 Picture *cur = s->current_picture_ptr;
8312 Picture *prev = h->delayed_output_pic;
8313 int i, pics, cross_idr, out_of_order, out_idx;
8315 if(h->sps.bitstream_restriction_flag
8316 && s->avctx->has_b_frames < h->sps.num_reorder_frames){
8317 s->avctx->has_b_frames = h->sps.num_reorder_frames;
8322 while(h->delayed_pic[pics]) pics++;
8323 h->delayed_pic[pics++] = cur;
8324 if(cur->reference == 0)
8328 for(i=0; h->delayed_pic[i]; i++)
8329 if(h->delayed_pic[i]->key_frame || h->delayed_pic[i]->poc==0)
8332 out = h->delayed_pic[0];
8334 for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
8335 if(h->delayed_pic[i]->poc < out->poc){
8336 out = h->delayed_pic[i];
8340 out_of_order = !cross_idr && prev && out->poc < prev->poc;
8341 if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
8343 else if(prev && pics <= s->avctx->has_b_frames)
8345 else if((out_of_order && pics-1 == s->avctx->has_b_frames && pics < 15)
8347 ((!cross_idr && prev && out->poc > prev->poc + 2)
8348 || cur->pict_type == B_TYPE)))
8351 s->avctx->has_b_frames++;
8354 else if(out_of_order)
8357 if(out_of_order || pics > s->avctx->has_b_frames){
8358 for(i=out_idx; h->delayed_pic[i]; i++)
8359 h->delayed_pic[i] = h->delayed_pic[i+1];
8365 *data_size = sizeof(AVFrame);
8366 if(prev && prev != out && prev->reference == 1)
8367 prev->reference = 0;
8368 h->delayed_output_pic = out;
8372 *pict= *(AVFrame*)out;
8374 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
8377 assert(pict->data[0] || !*data_size);
8378 ff_print_debug_info(s, pict);
8379 //printf("out %d\n", (int)pict->data[0]);
8382 /* Return the Picture timestamp as the frame number */
8383 /* we substract 1 because it is added on utils.c */
8384 avctx->frame_number = s->picture_number - 1;
8386 return get_consumed_bytes(s, buf_index, buf_size);
8389 static inline void fill_mb_avail(H264Context *h){
8390 MpegEncContext * const s = &h->s;
8391 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
8394 h->mb_avail[0]= s->mb_x && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
8395 h->mb_avail[1]= h->slice_table[mb_xy - s->mb_stride ] == h->slice_num;
8396 h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
8402 h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
8403 h->mb_avail[4]= 1; //FIXME move out
8404 h->mb_avail[5]= 0; //FIXME move out
8410 #define SIZE (COUNT*40)
8416 // int int_temp[10000];
8418 AVCodecContext avctx;
8420 dsputil_init(&dsp, &avctx);
8422 init_put_bits(&pb, temp, SIZE);
8423 printf("testing unsigned exp golomb\n");
8424 for(i=0; i<COUNT; i++){
8426 set_ue_golomb(&pb, i);
8427 STOP_TIMER("set_ue_golomb");
8429 flush_put_bits(&pb);
8431 init_get_bits(&gb, temp, 8*SIZE);
8432 for(i=0; i<COUNT; i++){
8435 s= show_bits(&gb, 24);
8438 j= get_ue_golomb(&gb);
8440 printf("missmatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
8443 STOP_TIMER("get_ue_golomb");
8447 init_put_bits(&pb, temp, SIZE);
8448 printf("testing signed exp golomb\n");
8449 for(i=0; i<COUNT; i++){
8451 set_se_golomb(&pb, i - COUNT/2);
8452 STOP_TIMER("set_se_golomb");
8454 flush_put_bits(&pb);
8456 init_get_bits(&gb, temp, 8*SIZE);
8457 for(i=0; i<COUNT; i++){
8460 s= show_bits(&gb, 24);
8463 j= get_se_golomb(&gb);
8464 if(j != i - COUNT/2){
8465 printf("missmatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
8468 STOP_TIMER("get_se_golomb");
8471 printf("testing 4x4 (I)DCT\n");
8474 uint8_t src[16], ref[16];
8475 uint64_t error= 0, max_error=0;
8477 for(i=0; i<COUNT; i++){
8479 // printf("%d %d %d\n", r1, r2, (r2-r1)*16);
8480 for(j=0; j<16; j++){
8481 ref[j]= random()%255;
8482 src[j]= random()%255;
8485 h264_diff_dct_c(block, src, ref, 4);
8488 for(j=0; j<16; j++){
8489 // printf("%d ", block[j]);
8490 block[j]= block[j]*4;
8491 if(j&1) block[j]= (block[j]*4 + 2)/5;
8492 if(j&4) block[j]= (block[j]*4 + 2)/5;
8496 s->dsp.h264_idct_add(ref, block, 4);
8497 /* for(j=0; j<16; j++){
8498 printf("%d ", ref[j]);
8502 for(j=0; j<16; j++){
8503 int diff= ABS(src[j] - ref[j]);
8506 max_error= FFMAX(max_error, diff);
8509 printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
8511 printf("testing quantizer\n");
8512 for(qp=0; qp<52; qp++){
8514 src1_block[i]= src2_block[i]= random()%255;
8518 printf("Testing NAL layer\n");
8520 uint8_t bitstream[COUNT];
8521 uint8_t nal[COUNT*2];
8523 memset(&h, 0, sizeof(H264Context));
8525 for(i=0; i<COUNT; i++){
8533 for(j=0; j<COUNT; j++){
8534 bitstream[j]= (random() % 255) + 1;
8537 for(j=0; j<zeros; j++){
8538 int pos= random() % COUNT;
8539 while(bitstream[pos] == 0){
8548 nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
8550 printf("encoding failed\n");
8554 out= decode_nal(&h, nal, &out_length, &consumed, nal_length);
8558 if(out_length != COUNT){
8559 printf("incorrect length %d %d\n", out_length, COUNT);
8563 if(consumed != nal_length){
8564 printf("incorrect consumed length %d %d\n", nal_length, consumed);
8568 if(memcmp(bitstream, out, COUNT)){
8569 printf("missmatch\n");
8574 printf("Testing RBSP\n");
8582 static int decode_end(AVCodecContext *avctx)
8584 H264Context *h = avctx->priv_data;
8585 MpegEncContext *s = &h->s;
8587 av_freep(&h->rbsp_buffer);
8588 free_tables(h); //FIXME cleanup init stuff perhaps
8591 // memset(h, 0, sizeof(H264Context));
8597 AVCodec h264_decoder = {
8601 sizeof(H264Context),
8606 /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,
8610 #ifdef CONFIG_H264_PARSER
8611 AVCodecParser h264_parser = {
8613 sizeof(H264Context),