git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  *
  21  */
  22
  23 /**
  24  * @file h264.c
  25  * H.264 / AVC / MPEG4 part10 codec.
  26  * @author Michael Niedermayer <michaelni@gmx.at>
  27  */
  28
  29 #include "common.h"
  30 #include "dsputil.h"
  31 #include "avcodec.h"
  32 #include "mpegvideo.h"
  33 #include "h264data.h"
  34 #include "golomb.h"
  35
  36 #include "cabac.h"
  37
  38 //#undef NDEBUG
  39 #include <assert.h>
  40
  41 #define interlaced_dct interlaced_dct_is_a_bad_name
  42 #define mb_intra mb_intra_isnt_initalized_see_mb_type
     /* The two defines above rewrite every later use of the identifiers
      * interlaced_dct and mb_intra into the long names on the right-hand side.
      * NOTE(review): presumably this poisons like-named fields from an included
      * header so that accidental uses in this file stand out -- confirm. */
  43
     /* NOTE(review): presumably the positions of the luma / chroma DC blocks in
      * the per-macroblock block numbering; their users are not visible here. */
  44 #define LUMA_DC_BLOCK_INDEX   25
  45 #define CHROMA_DC_BLOCK_INDEX 26
  46
     /* NOTE(review): presumably the lookup widths (in bits) of the static VLC
      * tables declared later in this file; confirm against the VLC init code. */
  47 #define CHROMA_DC_COEFF_TOKEN_VLC_BITS 8
  48 #define COEFF_TOKEN_VLC_BITS           8
  49 #define TOTAL_ZEROS_VLC_BITS           9
  50 #define CHROMA_DC_TOTAL_ZEROS_VLC_BITS 3
  51 #define RUN_VLC_BITS                   3
  52 #define RUN7_VLC_BITS                  6
  53
  54 #define MAX_SPS_COUNT 32 ///< number of entries in H264Context.sps_buffer
  55 #define MAX_PPS_COUNT 256 ///< number of entries in H264Context.pps_buffer
  56
  57 #define MAX_MMCO_COUNT 66 ///< number of entries in H264Context.mmco
  58
  59 /* Compiling in interlaced support reduces the speed
  60  * of progressive decoding by about 2%. */
  61 #define ALLOW_INTERLACE
  62
     /* With ALLOW_INTERLACE defined, the three macros below expand to members of
      * a variable named `h`, so an H264Context pointer called `h` must be in
      * scope wherever they are used. Without it they become the constant 0 and
      * IS_INTERLACED() is redefined to always evaluate to 0. */
  63 #ifdef ALLOW_INTERLACE
  64 #define MB_MBAFF h->mb_mbaff
  65 #define MB_FIELD h->mb_field_decoding_flag
  66 #define FRAME_MBAFF h->mb_aff_frame
  67 #else
  68 #define MB_MBAFF 0
  69 #define MB_FIELD 0
  70 #define FRAME_MBAFF 0
  71 #undef  IS_INTERLACED
  72 #define IS_INTERLACED(mb_type) 0
  73 #endif
  74
  75 /**
  76  * Sequence parameter set
      *
      * Where a field has a trailing comment, that comment names the bitstream
      * syntax element the field corresponds to and any offset already applied.
      * One instance per id is kept in H264Context.sps_buffer, plus a "current"
      * copy in H264Context.sps.
  77  */
  78 typedef struct SPS{
  79
  80     int profile_idc;
  81     int level_idc;
  82     int transform_bypass;              ///< qpprime_y_zero_transform_bypass_flag
  83     int log2_max_frame_num;            ///< log2_max_frame_num_minus4 + 4
  84     int poc_type;                      ///< pic_order_cnt_type
  85     int log2_max_poc_lsb;              ///< log2_max_pic_order_cnt_lsb_minus4 (NOTE(review): confirm whether +4 is already applied here, as it is for log2_max_frame_num)
  86     int delta_pic_order_always_zero_flag;
  87     int offset_for_non_ref_pic;
  88     int offset_for_top_to_bottom_field;
  89     int poc_cycle_length;              ///< num_ref_frames_in_pic_order_cnt_cycle
  90     int ref_frame_count;               ///< num_ref_frames
  91     int gaps_in_frame_num_allowed_flag;
  92     int mb_width;                      ///< frame_width_in_mbs_minus1 + 1
  93     int mb_height;                     ///< frame_height_in_mbs_minus1 + 1
  94     int frame_mbs_only_flag;
  95     int mb_aff;                        ///< mb_adaptive_frame_field_flag
  96     int direct_8x8_inference_flag;
  97     int crop;                   ///< frame_cropping_flag
  98     int crop_left;              ///< frame_cropping_rect_left_offset
  99     int crop_right;             ///< frame_cropping_rect_right_offset
 100     int crop_top;               ///< frame_cropping_rect_top_offset
 101     int crop_bottom;            ///< frame_cropping_rect_bottom_offset
 102     int vui_parameters_present_flag;
 103     AVRational sar;
 104     int timing_info_present_flag;
 105     uint32_t num_units_in_tick;
 106     uint32_t time_scale;
 107     int fixed_frame_rate_flag;
 108     short offset_for_ref_frame[256]; //FIXME allocate dynamically?
 109     int bitstream_restriction_flag;
 110     int num_reorder_frames;
 111     int scaling_matrix_present;
 112     uint8_t scaling_matrix4[6][16];    ///< six 4x4 (16-entry) scaling lists
 113     uint8_t scaling_matrix8[2][64];    ///< two 8x8 (64-entry) scaling lists
 114 }SPS;
 115
 116 /**
 117  * Picture parameter set
      *
      * Trailing comments name the bitstream syntax element a field corresponds
      * to and any offset already applied. One instance per id is kept in
      * H264Context.pps_buffer, plus a "current" copy in H264Context.pps.
 118  */
 119 typedef struct PPS{
 120     unsigned int sps_id;        ///< NOTE(review): presumably the index of the referenced SPS in H264Context.sps_buffer -- confirm
 121     int cabac;                  ///< entropy_coding_mode_flag
 122     int pic_order_present;      ///< pic_order_present_flag
 123     int slice_group_count;      ///< num_slice_groups_minus1 + 1
 124     int mb_slice_group_map_type;
 125     unsigned int ref_count[2];  ///< num_ref_idx_l0/1_active_minus1 + 1
 126     int weighted_pred;          ///< weighted_pred_flag
 127     int weighted_bipred_idc;
 128     int init_qp;                ///< pic_init_qp_minus26 + 26
 129     int init_qs;                ///< pic_init_qs_minus26 + 26
 130     int chroma_qp_index_offset;
 131     int deblocking_filter_parameters_present; ///< deblocking_filter_parameters_present_flag
 132     int constrained_intra_pred; ///< constrained_intra_pred_flag
 133     int redundant_pic_cnt_present; ///< redundant_pic_cnt_present_flag
 134     int transform_8x8_mode;     ///< transform_8x8_mode_flag
 135     uint8_t scaling_matrix4[6][16]; ///< six 4x4 (16-entry) scaling lists
 136     uint8_t scaling_matrix8[2][64]; ///< two 8x8 (64-entry) scaling lists
 137 }PPS;
 138
 139 /**
 140  * Memory management control operation opcode.
      *
      * The enumerators take the consecutive values 0 (MMCO_END) to 6 (MMCO_LONG).
      * NOTE(review): presumably these match the numeric operation codes read
      * from the bitstream; the parser is not visible here -- confirm.
 141  */
 142 typedef enum MMCOOpcode{
 143     MMCO_END=0,
 144     MMCO_SHORT2UNUSED,
 145     MMCO_LONG2UNUSED,
 146     MMCO_SHORT2LONG,
 147     MMCO_SET_MAX_LONG,
 148     MMCO_RESET,
 149     MMCO_LONG,
 150 } MMCOOpcode;
 151
 152 /**
 153  * Memory management control operation.
      *
      * One queued operation: the opcode plus its two possible operands.
      * H264Context.mmco holds up to MAX_MMCO_COUNT of these.
      * NOTE(review): which operand is meaningful for which opcode is decided by
      * code outside this view.
 154  */
 155 typedef struct MMCO{
 156     MMCOOpcode opcode;
 157     int short_frame_num;
 158     int long_index;
 159 } MMCO;
 160
 161 /**
 162  * H264Context
      *
      * Complete decoder state. The generic MpegEncContext is embedded as the
      * first member `s`; the rest holds the stored and current parameter sets,
      * per-macroblock neighbour caches (filled by fill_caches()), reference
      * picture lists, weighted-prediction tables, CABAC state and scan tables.
 163  */
 164 typedef struct H264Context{
 165     MpegEncContext s;
 166     int nal_ref_idc;
 167     int nal_unit_type;
 168     uint8_t *rbsp_buffer;
 169     unsigned int rbsp_buffer_size;
 170
 171     /**
 172       * Used to parse AVC variant of h264
 173       */
 174     int is_avc; ///< this flag is != 0 if codec is avc1
 175     int got_avcC; ///< flag used to parse avcC data only once
 176     int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4)
 177
 178     int chroma_qp; //QPc
 179
 180     int prev_mb_skipped;
 181     int next_mb_skipped;
 182
 183     //prediction stuff
 184     int chroma_pred_mode;
 185     int intra16x16_pred_mode;
 186
 187     int top_mb_xy;      ///< macroblock index of the top neighbour, written by fill_caches()
 188     int left_mb_xy[2];  ///< macroblock indices of the two left neighbours, written by fill_caches()
 189
 190     int8_t intra4x4_pred_mode_cache[5*8];
 191     int8_t (*intra4x4_pred_mode)[8];
 192     void (*pred4x4  [9+3])(uint8_t *src, uint8_t *topright, int stride);//FIXME move to dsp?
 193     void (*pred8x8l [9+3])(uint8_t *src, int topleft, int topright, int stride);
 194     void (*pred8x8  [4+3])(uint8_t *src, int stride);
 195     void (*pred16x16[4+3])(uint8_t *src, int stride);
         /* The four *_samples_available members are bit masks that
          * fill_caches() computes for intra macroblocks from the neighbours'
          * types and constrained_intra_pred. */
 196     unsigned int topleft_samples_available;
 197     unsigned int top_samples_available;
 198     unsigned int topright_samples_available;
 199     unsigned int left_samples_available;
 200     uint8_t (*top_borders[2])[16+2*8];
 201     uint8_t left_border[2*(17+2*9)];
 202
 203     /**
 204      * non zero coeff count cache.
 205      * is 64 if not available.
          * (fill_caches() stores 0 instead of 64 for an unavailable neighbour
          * when CABAC is in use and the current macroblock is not intra.)
 206      */
 207     DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]);
 208     uint8_t (*non_zero_count)[16];
 209
 210     /**
 211      * Motion vector cache.
 212      */
 213     DECLARE_ALIGNED_8(int16_t, mv_cache[2][5*8][2]);
 214     DECLARE_ALIGNED_8(int8_t, ref_cache[2][5*8]);
         /* Sentinel values stored in ref_cache in place of a reference index. */
 215 #define LIST_NOT_USED -1 //FIXME rename?
 216 #define PART_NOT_AVAILABLE -2
 217
 218     /**
 219      * is 1 if the specific list MV&references are set to 0,0,-2.
 220      */
 221     int mv_cache_clean[2];
 222
 223     /**
 224      * number of neighbors (top and/or left) that used 8x8 dct
 225      */
 226     int neighbor_transform_size;
 227
 228     /**
 229      * block_offset[ 0..23] for frame macroblocks
 230      * block_offset[24..47] for field macroblocks
 231      */
 232     int block_offset[2*(16+8)];
 233
 234     uint32_t *mb2b_xy; //FIXME are these 4 a good idea?
 235     uint32_t *mb2b8_xy;
 236     int b_stride; //FIXME use s->b4_stride
 237     int b8_stride;
 238
 239     int mb_linesize;   ///< may be equal to s->linesize or s->linesize*2, for mbaff
 240     int mb_uvlinesize;
 241
 242     int emu_edge_width;
 243     int emu_edge_height;
 244
 245     int halfpel_flag;
 246     int thirdpel_flag;
 247
 248     int unknown_svq3_flag;
 249     int next_slice_index;
 250
 251     SPS sps_buffer[MAX_SPS_COUNT];
 252     SPS sps; ///< current sps
 253
 254     PPS pps_buffer[MAX_PPS_COUNT];
 255     /**
 256      * current pps
 257      */
 258     PPS pps; //FIXME move to Picture perhaps? (->no) do we need that?
 259
 260     uint32_t dequant4_buffer[6][52][16];
 261     uint32_t dequant8_buffer[2][52][64];
 262     uint32_t (*dequant4_coeff[6])[16];
 263     uint32_t (*dequant8_coeff[2])[64];
 264     int dequant_coeff_pps;     ///< reinit tables when pps changes
 265
 266     int slice_num;
 267     uint8_t *slice_table_base;
 268     uint8_t *slice_table;      ///< slice_table_base + 2*mb_stride + 1
 269     int slice_type;
 270     int slice_type_fixed;
 271
 272     //interlacing specific flags
 273     int mb_aff_frame;
 274     int mb_field_decoding_flag;
 275     int mb_mbaff;              ///< mb_aff_frame && mb_field_decoding_flag
 276
 277     unsigned int sub_mb_type[4];
 278
 279     //POC stuff
 280     int poc_lsb;
 281     int poc_msb;
 282     int delta_poc_bottom;
 283     int delta_poc[2];
 284     int frame_num;
 285     int prev_poc_msb;             ///< poc_msb of the last reference pic for POC type 0
 286     int prev_poc_lsb;             ///< poc_lsb of the last reference pic for POC type 0
 287     int frame_num_offset;         ///< for POC type 2
 288     int prev_frame_num_offset;    ///< for POC type 2
 289     int prev_frame_num;           ///< frame_num of the last pic for POC type 1/2
 290
 291     /**
 292      * frame_num for frames or 2*frame_num for field pics.
 293      */
 294     int curr_pic_num;
 295
 296     /**
 297      * max_frame_num or 2*max_frame_num for field pics.
 298      */
 299     int max_pic_num;
 300
 301     //Weighted pred stuff
 302     int use_weight;
 303     int use_weight_chroma;
 304     int luma_log2_weight_denom;
 305     int chroma_log2_weight_denom;
 306     int luma_weight[2][48];
 307     int luma_offset[2][48];
 308     int chroma_weight[2][48][2];
 309     int chroma_offset[2][48][2];
 310     int implicit_weight[48][48];
 311
 312     //deblock
 313     int deblocking_filter;         ///< disable_deblocking_filter_idc with 1<->0
 314     int slice_alpha_c0_offset;
 315     int slice_beta_offset;
 316
 317     int redundant_pic_count;
 318
 319     int direct_spatial_mv_pred;
 320     int dist_scale_factor[16];
 321     int dist_scale_factor_field[32];
 322     int map_col_to_list0[2][16];
 323     int map_col_to_list0_field[2][32];
 324
 325     /**
 326      * num_ref_idx_l0/1_active_minus1 + 1
 327      */
 328     unsigned int ref_count[2];   ///< counts frames or fields, depending on current mb mode
 329     Picture *short_ref[32];
 330     Picture *long_ref[32];
 331     Picture default_ref_list[2][32];
 332     Picture ref_list[2][48];     ///< 0..15: frame refs, 16..47: mbaff field refs
 333     Picture *delayed_pic[18]; //FIXME size?
 334     Picture *delayed_output_pic;
 335
 336     /**
 337      * memory management control operations buffer.
 338      */
 339     MMCO mmco[MAX_MMCO_COUNT];
 340     int mmco_index;
 341
 342     int long_ref_count;  ///< number of actual long term references
 343     int short_ref_count; ///< number of actual short term references
 344
 345     //data partitioning
 346     GetBitContext intra_gb;
 347     GetBitContext inter_gb;
 348     GetBitContext *intra_gb_ptr;
 349     GetBitContext *inter_gb_ptr;
 350
 351     DECLARE_ALIGNED_8(DCTELEM, mb[16*24]);
 352     DCTELEM mb_padding[256];        ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not too large or ensure that there is some unused stuff after mb
 353
 354     /**
 355      * Cabac
 356      */
 357     CABACContext cabac;
 358     uint8_t      cabac_state[460];
 359     int          cabac_init_idc;
 360
 361     /* 0x100 -> non null luma_dc, 0x80/0x40 -> non null chroma_dc (cb/cr), 0x?0 -> chroma_cbp(0,1,2), 0x0? luma_cbp */
 362     uint16_t     *cbp_table;
 363     int cbp;
 364     int top_cbp;   ///< written by fill_caches() when pps.cabac is set
 365     int left_cbp;  ///< written by fill_caches() when pps.cabac is set
 366     /* chroma_pred_mode for i4x4 or i16x16, else 0 */
 367     uint8_t     *chroma_pred_mode_table;
 368     int         last_qscale_diff;
 369     int16_t     (*mvd_table[2])[2];
 370     DECLARE_ALIGNED_8(int16_t, mvd_cache[2][5*8][2]);
 371     uint8_t     *direct_table;
 372     uint8_t     direct_cache[5*8];
 373
 374     uint8_t zigzag_scan[16];
 375     uint8_t zigzag_scan8x8[64];
 376     uint8_t zigzag_scan8x8_cavlc[64];
 377     uint8_t field_scan[16];
 378     uint8_t field_scan8x8[64];
 379     uint8_t field_scan8x8_cavlc[64];
         /* NOTE(review): the *_q0 pointers are presumably the scan tables to use
          * at qscale 0 (transform bypass); their setup is not visible here. */
 380     const uint8_t *zigzag_scan_q0;
 381     const uint8_t *zigzag_scan8x8_q0;
 382     const uint8_t *zigzag_scan8x8_cavlc_q0;
 383     const uint8_t *field_scan_q0;
 384     const uint8_t *field_scan8x8_q0;
 385     const uint8_t *field_scan8x8_cavlc_q0;
 386
 387     int x264_build;
 388 }H264Context;
 389
 390 static VLC coeff_token_vlc[4];
     /* File-scope VLC tables (this one and those below), shared by everything in
      * this translation unit.
      * NOTE(review): presumably used for CAVLC residual decoding and built once
      * at decoder init; the init code is not visible here -- confirm. */
 391 static VLC chroma_dc_coeff_token_vlc;
 392
 393 static VLC total_zeros_vlc[15];
 394 static VLC chroma_dc_total_zeros_vlc[3];
 395
 396 static VLC run_vlc[6];
 397 static VLC run7_vlc;
 398
     /* Forward declarations of functions defined later in this file. */
 399 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
 400 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
 401 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
 402 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
 403
 404 static av_always_inline uint32_t pack16to32(int a, int b){
 405 #ifdef WORDS_BIGENDIAN
 406    return (b&0xFFFF) + (a<<16);
 407 #else
 408    return (a&0xFFFF) + (b<<16);
 409 #endif
 410 }
 411
 412 const uint8_t ff_rem6[52]={ ///< lookup table: ff_rem6[i] == i % 6 for i in 0..51
 413 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
 414 };
 415
 416 const uint8_t ff_div6[52]={ ///< lookup table: ff_div6[i] == i / 6 for i in 0..51
 417 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
 418 };
 419
 420
 421 /**
 422  * fill a rectangle.
      *
      * Stores val into every element of a w x h rectangle whose top-left
      * element is at vp. All stores go through casted uint16_t/uint32_t/uint64_t
      * pointers, one or more per row.
      *
      * @param vp pointer to the top-left element; asserted to be aligned to
      *           min(w*size, STRIDE_ALIGN) bytes
 423  * @param h height of the rectangle, should be a constant
      *          (1, 2 or 4; the 16-byte-wide case has no h==1 exit and always
      *          writes at least two rows)
 424  * @param w width of the rectangle, should be a constant
      *          (in elements, asserted to be at most 4; w*size must be 2, 4, 8
      *          or 16 bytes, anything else hits assert(0))
      * @param stride distance between rows in elements (it is multiplied by size
      *          before use); asserted to be a multiple of w once both are in bytes
      * @param val value to store; for size==1 the byte is replicated across the
      *          row by multiplication, for size==4 it is stored as a 32-bit element
 425  * @param size the size of val (1 or 4), should be a constant
 426  */
 427 static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
 428     uint8_t *p= (uint8_t*)vp;
 429     assert(size==1 || size==4);
 430     assert(w<=4);
 431
         /* From here on w and stride are measured in bytes. */
 432     w      *= size;
 433     stride *= size;
 434
 435     assert((((long)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
 436     assert((stride&(w-1))==0);
 437     if(w==2){
             /* 2-byte rows: one 16-bit store per row. */
 438         const uint16_t v= size==4 ? val : val*0x0101;
 439         *(uint16_t*)(p + 0*stride)= v;
 440         if(h==1) return;
 441         *(uint16_t*)(p + 1*stride)= v;
 442         if(h==2) return;
 443         *(uint16_t*)(p + 2*stride)=
 444         *(uint16_t*)(p + 3*stride)= v;
 445     }else if(w==4){
             /* 4-byte rows: one 32-bit store per row. */
 446         const uint32_t v= size==4 ? val : val*0x01010101;
 447         *(uint32_t*)(p + 0*stride)= v;
 448         if(h==1) return;
 449         *(uint32_t*)(p + 1*stride)= v;
 450         if(h==2) return;
 451         *(uint32_t*)(p + 2*stride)=
 452         *(uint32_t*)(p + 3*stride)= v;
 453     }else if(w==8){
 454     //gcc can't optimize 64bit math on x86_32
             /* The #if below selects between a 64-bit-store and a 32-bit-store
              * implementation of BOTH the w==8 and the w==16 branch; each
              * preprocessor arm contains its own "}else if(w==16){". */
 455 #if defined(ARCH_X86_64) || (defined(MP_WORDSIZE) && MP_WORDSIZE >= 64)
             /* val*0x0100000001ULL duplicates the 32-bit val into both halves. */
 456         const uint64_t v= val*0x0100000001ULL;
 457         *(uint64_t*)(p + 0*stride)= v;
 458         if(h==1) return;
 459         *(uint64_t*)(p + 1*stride)= v;
 460         if(h==2) return;
 461         *(uint64_t*)(p + 2*stride)=
 462         *(uint64_t*)(p + 3*stride)= v;
 463     }else if(w==16){
 464         const uint64_t v= val*0x0100000001ULL;
 465         *(uint64_t*)(p + 0+0*stride)=
 466         *(uint64_t*)(p + 8+0*stride)=
 467         *(uint64_t*)(p + 0+1*stride)=
 468         *(uint64_t*)(p + 8+1*stride)= v;
 469         if(h==2) return;
 470         *(uint64_t*)(p + 0+2*stride)=
 471         *(uint64_t*)(p + 8+2*stride)=
 472         *(uint64_t*)(p + 0+3*stride)=
 473         *(uint64_t*)(p + 8+3*stride)= v;
 474 #else
 475         *(uint32_t*)(p + 0+0*stride)=
 476         *(uint32_t*)(p + 4+0*stride)= val;
 477         if(h==1) return;
 478         *(uint32_t*)(p + 0+1*stride)=
 479         *(uint32_t*)(p + 4+1*stride)= val;
 480         if(h==2) return;
 481         *(uint32_t*)(p + 0+2*stride)=
 482         *(uint32_t*)(p + 4+2*stride)=
 483         *(uint32_t*)(p + 0+3*stride)=
 484         *(uint32_t*)(p + 4+3*stride)= val;
 485     }else if(w==16){
 486         *(uint32_t*)(p + 0+0*stride)=
 487         *(uint32_t*)(p + 4+0*stride)=
 488         *(uint32_t*)(p + 8+0*stride)=
 489         *(uint32_t*)(p +12+0*stride)=
 490         *(uint32_t*)(p + 0+1*stride)=
 491         *(uint32_t*)(p + 4+1*stride)=
 492         *(uint32_t*)(p + 8+1*stride)=
 493         *(uint32_t*)(p +12+1*stride)= val;
 494         if(h==2) return;
 495         *(uint32_t*)(p + 0+2*stride)=
 496         *(uint32_t*)(p + 4+2*stride)=
 497         *(uint32_t*)(p + 8+2*stride)=
 498         *(uint32_t*)(p +12+2*stride)=
 499         *(uint32_t*)(p + 0+3*stride)=
 500         *(uint32_t*)(p + 4+3*stride)=
 501         *(uint32_t*)(p + 8+3*stride)=
 502         *(uint32_t*)(p +12+3*stride)= val;
 503 #endif
 504     }else
 505         assert(0);
         /* Every path that falls through to here has written four rows. */
 506     assert(h==4);
 507 }
 508
 509 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 510     MpegEncContext * const s = &h->s;
 511     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 512     int topleft_xy, top_xy, topright_xy, left_xy[2];
 513     int topleft_type, top_type, topright_type, left_type[2];
 514     int left_block[8];
 515     int i;
 516
 517     //FIXME deblocking could skip the intra and nnz parts.
 518     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[mb_xy-s->mb_stride]) && !FRAME_MBAFF)
 519         return;
 520
 521     //wow what a mess, why didn't they simplify the interlacing&intra stuff, i can't imagine that these complex rules are worth it
 522
 523     top_xy     = mb_xy  - s->mb_stride;
 524     topleft_xy = top_xy - 1;
 525     topright_xy= top_xy + 1;
 526     left_xy[1] = left_xy[0] = mb_xy-1;
 527     left_block[0]= 0;
 528     left_block[1]= 1;
 529     left_block[2]= 2;
 530     left_block[3]= 3;
 531     left_block[4]= 7;
 532     left_block[5]= 10;
 533     left_block[6]= 8;
 534     left_block[7]= 11;
 535     if(FRAME_MBAFF){
 536         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 537         const int top_pair_xy      = pair_xy     - s->mb_stride;
 538         const int topleft_pair_xy  = top_pair_xy - 1;
 539         const int topright_pair_xy = top_pair_xy + 1;
 540         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 541         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 542         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 543         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 544         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 545         const int bottom = (s->mb_y & 1);
 546         tprintf("fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 547         if (bottom
 548                 ? !curr_mb_frame_flag // bottom macroblock
 549                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 550                 ) {
 551             top_xy -= s->mb_stride;
 552         }
 553         if (bottom
 554                 ? !curr_mb_frame_flag // bottom macroblock
 555                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 556                 ) {
 557             topleft_xy -= s->mb_stride;
 558         }
 559         if (bottom
 560                 ? !curr_mb_frame_flag // bottom macroblock
 561                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 562                 ) {
 563             topright_xy -= s->mb_stride;
 564         }
 565         if (left_mb_frame_flag != curr_mb_frame_flag) {
 566             left_xy[1] = left_xy[0] = pair_xy - 1;
 567             if (curr_mb_frame_flag) {
 568                 if (bottom) {
 569                     left_block[0]= 2;
 570                     left_block[1]= 2;
 571                     left_block[2]= 3;
 572                     left_block[3]= 3;
 573                     left_block[4]= 8;
 574                     left_block[5]= 11;
 575                     left_block[6]= 8;
 576                     left_block[7]= 11;
 577                 } else {
 578                     left_block[0]= 0;
 579                     left_block[1]= 0;
 580                     left_block[2]= 1;
 581                     left_block[3]= 1;
 582                     left_block[4]= 7;
 583                     left_block[5]= 10;
 584                     left_block[6]= 7;
 585                     left_block[7]= 10;
 586                 }
 587             } else {
 588                 left_xy[1] += s->mb_stride;
 589                 //left_block[0]= 0;
 590                 left_block[1]= 2;
 591                 left_block[2]= 0;
 592                 left_block[3]= 2;
 593                 //left_block[4]= 7;
 594                 left_block[5]= 10;
 595                 left_block[6]= 7;
 596                 left_block[7]= 10;
 597             }
 598         }
 599     }
 600
 601     h->top_mb_xy = top_xy;
 602     h->left_mb_xy[0] = left_xy[0];
 603     h->left_mb_xy[1] = left_xy[1];
 604     if(for_deblock){
 605         topleft_type = 0;
 606         topright_type = 0;
 607         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
 608         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
 609         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
 610
 611         if(FRAME_MBAFF && !IS_INTRA(mb_type)){
 612             int list;
 613             int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
 614             for(i=0; i<16; i++)
 615                 h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
 616             for(list=0; list<1+(h->slice_type==B_TYPE); list++){
 617                 if(USES_LIST(mb_type,list)){
 618                     uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
 619                     uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
 620                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 621                     for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
 622                         dst[0] = src[0];
 623                         dst[1] = src[1];
 624                         dst[2] = src[2];
 625                         dst[3] = src[3];
 626                     }
 627                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 628                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
 629                     ref += h->b8_stride;
 630                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 631                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
 632                 }else{
 633                     fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
 634                     fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
 635                 }
 636             }
 637         }
 638     }else{
 639         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 640         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 641         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 642         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 643         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 644     }
 645
 646     if(IS_INTRA(mb_type)){
 647         h->topleft_samples_available=
 648         h->top_samples_available=
 649         h->left_samples_available= 0xFFFF;
 650         h->topright_samples_available= 0xEEEA;
 651
 652         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
 653             h->topleft_samples_available= 0xB3FF;
 654             h->top_samples_available= 0x33FF;
 655             h->topright_samples_available= 0x26EA;
 656         }
 657         for(i=0; i<2; i++){
 658             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
 659                 h->topleft_samples_available&= 0xDF5F;
 660                 h->left_samples_available&= 0x5F5F;
 661             }
 662         }
 663
 664         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
 665             h->topleft_samples_available&= 0x7FFF;
 666
 667         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
 668             h->topright_samples_available&= 0xFBFF;
 669
 670         if(IS_INTRA4x4(mb_type)){
 671             if(IS_INTRA4x4(top_type)){
 672                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 673                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 674                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 675                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 676             }else{
 677                 int pred;
 678                 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
 679                     pred= -1;
 680                 else{
 681                     pred= 2;
 682                 }
 683                 h->intra4x4_pred_mode_cache[4+8*0]=
 684                 h->intra4x4_pred_mode_cache[5+8*0]=
 685                 h->intra4x4_pred_mode_cache[6+8*0]=
 686                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 687             }
 688             for(i=0; i<2; i++){
 689                 if(IS_INTRA4x4(left_type[i])){
 690                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 691                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 692                 }else{
 693                     int pred;
 694                     if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
 695                         pred= -1;
 696                     else{
 697                         pred= 2;
 698                     }
 699                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 700                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 701                 }
 702             }
 703         }
 704     }
 705
 706
 707 /*
 708 0 . T T. T T T T
 709 1 L . .L . . . .
 710 2 L . .L . . . .
 711 3 . T TL . . . .
 712 4 L . .L . . . .
 713 5 L . .. . . . .
 714 */
 715 //FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
 716     if(top_type){
 717         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 718         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 719         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 720         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 721
 722         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 723         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 724
 725         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 726         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 727
 728     }else{
 729         h->non_zero_count_cache[4+8*0]=
 730         h->non_zero_count_cache[5+8*0]=
 731         h->non_zero_count_cache[6+8*0]=
 732         h->non_zero_count_cache[7+8*0]=
 733
 734         h->non_zero_count_cache[1+8*0]=
 735         h->non_zero_count_cache[2+8*0]=
 736
 737         h->non_zero_count_cache[1+8*3]=
 738         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 739
 740     }
 741
 742     for (i=0; i<2; i++) {
 743         if(left_type[i]){
 744             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 745             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 746             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 747             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 748         }else{
 749             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 750             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 751             h->non_zero_count_cache[0+8*1 +   8*i]=
 752             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 753         }
 754     }
 755
 756     if( h->pps.cabac ) {
 757         // top_cbp
 758         if(top_type) {
 759             h->top_cbp = h->cbp_table[top_xy];
 760         } else if(IS_INTRA(mb_type)) {
 761             h->top_cbp = 0x1C0;
 762         } else {
 763             h->top_cbp = 0;
 764         }
 765         // left_cbp
 766         if (left_type[0]) {
 767             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 768         } else if(IS_INTRA(mb_type)) {
 769             h->left_cbp = 0x1C0;
 770         } else {
 771             h->left_cbp = 0;
 772         }
 773         if (left_type[0]) {
 774             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 775         }
 776         if (left_type[1]) {
 777             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 778         }
 779     }
 780
 781 #if 1
 782     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 783         int list;
 784         for(list=0; list<1+(h->slice_type==B_TYPE); list++){
 785             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 786                 /*if(!h->mv_cache_clean[list]){
 787                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 788                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 789                     h->mv_cache_clean[list]= 1;
 790                 }*/
 791                 continue;
 792             }
 793             h->mv_cache_clean[list]= 0;
 794
 795             if(USES_LIST(top_type, list)){
 796                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 797                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 798                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 799                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 800                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 801                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 802                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 803                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 804                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 805                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 806             }else{
 807                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 808                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 809                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 810                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 811                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 812             }
 813
 814             //FIXME unify cleanup or sth
 815             if(USES_LIST(left_type[0], list)){
 816                 const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 817                 const int b8_xy= h->mb2b8_xy[left_xy[0]] + 1;
 818                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0]];
 819                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1]];
 820                 h->ref_cache[list][scan8[0] - 1 + 0*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0]>>1)];
 821                 h->ref_cache[list][scan8[0] - 1 + 1*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1]>>1)];
 822             }else{
 823                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 0*8]=
 824                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 1*8]= 0;
 825                 h->ref_cache[list][scan8[0] - 1 + 0*8]=
 826                 h->ref_cache[list][scan8[0] - 1 + 1*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 827             }
 828
 829             if(USES_LIST(left_type[1], list)){
 830                 const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 831                 const int b8_xy= h->mb2b8_xy[left_xy[1]] + 1;
 832                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[2]];
 833                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[3]];
 834                 h->ref_cache[list][scan8[0] - 1 + 2*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[2]>>1)];
 835                 h->ref_cache[list][scan8[0] - 1 + 3*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[3]>>1)];
 836             }else{
 837                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 2*8]=
 838                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 3*8]= 0;
 839                 h->ref_cache[list][scan8[0] - 1 + 2*8]=
 840                 h->ref_cache[list][scan8[0] - 1 + 3*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 841                 assert((!left_type[0]) == (!left_type[1]));
 842             }
 843
 844             if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
 845                 continue;
 846
 847             if(USES_LIST(topleft_type, list)){
 848                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
 849                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride;
 850                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 851                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 852             }else{
 853                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 854                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 855             }
 856
 857             if(USES_LIST(topright_type, list)){
 858                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 859                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 860                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 861                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 862             }else{
 863                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 864                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 865             }
 866
 867             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 868                 continue;
 869
 870             h->ref_cache[list][scan8[5 ]+1] =
 871             h->ref_cache[list][scan8[7 ]+1] =
 872             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 873             h->ref_cache[list][scan8[4 ]] =
 874             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 875             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 876             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 877             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 878             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 879             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 880
 881             if( h->pps.cabac ) {
 882                 /* XXX beurk, Load mvd */
 883                 if(USES_LIST(top_type, list)){
 884                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 885                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 886                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 887                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 888                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 889                 }else{
 890                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 891                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 892                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 893                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 894                 }
 895                 if(USES_LIST(left_type[0], list)){
 896                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 897                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 898                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 899                 }else{
 900                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 901                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 902                 }
 903                 if(USES_LIST(left_type[1], list)){
 904                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 905                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 906                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 907                 }else{
 908                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 909                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 910                 }
 911                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 912                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 913                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 914                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 915                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 916
 917                 if(h->slice_type == B_TYPE){
 918                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 919
 920                     if(IS_DIRECT(top_type)){
 921                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 922                     }else if(IS_8X8(top_type)){
 923                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 924                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 925                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 926                     }else{
 927                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 928                     }
 929
 930                     if(IS_DIRECT(left_type[0]))
 931                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 932                     else if(IS_8X8(left_type[0]))
 933                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 934                     else
 935                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 936
 937                     if(IS_DIRECT(left_type[1]))
 938                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 939                     else if(IS_8X8(left_type[1]))
 940                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 941                     else
 942                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 943                 }
 944             }
 945
 946             if(FRAME_MBAFF){
 947 #define MAP_MVS\
 948                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 949                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 950                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 951                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 952                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 953                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 954                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 955                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 956                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 957                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 958                 if(MB_FIELD){
 959 #define MAP_F2F(idx, mb_type)\
 960                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 961                         h->ref_cache[list][idx] <<= 1;\
 962                         h->mv_cache[list][idx][1] /= 2;\
 963                         h->mvd_cache[list][idx][1] /= 2;\
 964                     }
 965                     MAP_MVS
 966 #undef MAP_F2F
 967                 }else{
 968 #define MAP_F2F(idx, mb_type)\
 969                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 970                         h->ref_cache[list][idx] >>= 1;\
 971                         h->mv_cache[list][idx][1] <<= 1;\
 972                         h->mvd_cache[list][idx][1] <<= 1;\
 973                     }
 974                     MAP_MVS
 975 #undef MAP_F2F
 976                 }
 977             }
 978         }
 979     }
 980 #endif
 981
 982     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 983 }
 984
 985 static inline void write_back_intra_pred_mode(H264Context *h){
 986     MpegEncContext * const s = &h->s;
 987     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 988
 989     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 990     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 991     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 992     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 993     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 994     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 995     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 996 }
 997
 998 /**
 999  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
1000  */
1001 static inline int check_intra4x4_pred_mode(H264Context *h){
1002     MpegEncContext * const s = &h->s;
1003     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
1004     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
1005     int i;
1006
1007     if(!(h->top_samples_available&0x8000)){
1008         for(i=0; i<4; i++){
1009             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
1010             if(status<0){
1011                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
1012                 return -1;
1013             } else if(status){
1014                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
1015             }
1016         }
1017     }
1018
1019     if(!(h->left_samples_available&0x8000)){
1020         for(i=0; i<4; i++){
1021             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
1022             if(status<0){
1023                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
1024                 return -1;
1025             } else if(status){
1026                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
1027             }
1028         }
1029     }
1030
1031     return 0;
1032 } //FIXME cleanup like next
1033
1034 /**
1035  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
1036  */
1037 static inline int check_intra_pred_mode(H264Context *h, int mode){
1038     MpegEncContext * const s = &h->s;
1039     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
1040     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
1041
1042     if(mode > 6U) {
1043         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
1044         return -1;
1045     }
1046
1047     if(!(h->top_samples_available&0x8000)){
1048         mode= top[ mode ];
1049         if(mode<0){
1050             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
1051             return -1;
1052         }
1053     }
1054
1055     if(!(h->left_samples_available&0x8000)){
1056         mode= left[ mode ];
1057         if(mode<0){
1058             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
1059             return -1;
1060         }
1061     }
1062
1063     return mode;
1064 }
1065
1066 /**
1067  * gets the predicted intra4x4 prediction mode.
1068  */
1069 static inline int pred_intra_mode(H264Context *h, int n){
1070     const int index8= scan8[n];
1071     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
1072     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
1073     const int min= FFMIN(left, top);
1074
1075     tprintf("mode:%d %d min:%d\n", left ,top, min);
1076
1077     if(min<0) return DC_PRED;
1078     else      return min;
1079 }
1080
1081 static inline void write_back_non_zero_count(H264Context *h){
1082     MpegEncContext * const s = &h->s;
1083     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
1084
1085     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
1086     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
1087     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
1088     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
1089     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
1090     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
1091     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
1092
1093     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
1094     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
1095     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
1096
1097     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
1098     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
1099     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
1100
1101     if(FRAME_MBAFF){
1102         // store all luma nnzs, for deblocking
1103         int v = 0, i;
1104         for(i=0; i<16; i++)
1105             v += (!!h->non_zero_count_cache[scan8[i]]) << i;
1106         *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
1107     }
1108 }
1109
1110 /**
1111  * gets the predicted number of non zero coefficients.
1112  * @param n block index
1113  */
1114 static inline int pred_non_zero_count(H264Context *h, int n){
1115     const int index8= scan8[n];
1116     const int left= h->non_zero_count_cache[index8 - 1];
1117     const int top = h->non_zero_count_cache[index8 - 8];
1118     int i= left + top;
1119
1120     if(i<64) i= (i+1)>>1;
1121
1122     tprintf("pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
1123
1124     return i&31;
1125 }
1126
/**
 * Fetches the diagonal neighbour (C) used by the motion vector prediction.
 * Outside of the FRAME_MBAFF special cases, the cache entry above and
 * part_width entries to the right of i is used when its reference is not
 * PART_NOT_AVAILABLE; otherwise the entry above-left of i is used.
 * @param C receives a pointer to the motion vector of the chosen neighbour
 * @param i index of the partition in mv_cache / ref_cache
 * @param list reference list to read from
 * @param part_width width of the partition in cache entries
 * @return the reference index that belongs to *C
 */
1127 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
1128     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
1129
1130     /* there is no consistent mapping of mvs to neighboring locations that will
1131      * make mbaff happy, so we can't move all this logic to fill_caches */
1132     if(FRAME_MBAFF){
1133         MpegEncContext *s = &h->s;
1134         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
1135         const int16_t *mv;
         /* cache entry scan8[0]-2 serves as scratch space for a converted
          * vector; it is zeroed and *C points at it, so *C is valid even when
          * SET_DIAG_MV returns LIST_NOT_USED without writing a vector */
1136         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
1137         *C = h->mv_cache[list][scan8[0]-2];
1138
         /* case 1: MB_FIELD unset, odd macroblock row, i in the first cache
          * row of the macroblock and the top-right entry available */
1139         if(!MB_FIELD
1140            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
             /* macroblock in the row above; one column further right when i
              * is scan8[0]+3 */
1141             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
1142             if(IS_INTERLACED(mb_types[topright_xy])){
                 /* SET_DIAG_MV reads the vector of the 4x4 block at (X4,Y4)
                  * directly from the current picture, stores it in the scratch
                  * entry with MV_OP applied to its vertical component and
                  * returns from fetch_diagonal_mv() with the matching
                  * reference index after applying REF_OP (or with
                  * LIST_NOT_USED). It declares locals, so each use needs a
                  * block of its own. */
1143 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
1144                 const int x4 = X4, y4 = Y4;\
1145                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
1146                 if(!USES_LIST(mb_type,list) && !IS_8X8(mb_type))\
1147                     return LIST_NOT_USED;\
1148                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
1149                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
1150                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
1151                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
1152
1153                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
1154             }
1155         }
         /* case 2: top-right entry unavailable, (i&7)==4 and the entry left of
          * scan8[0] available; the vector is then taken from the left
          * macroblock when its IS_INTERLACED state differs from MB_FIELD.
          * NOTE(review): (i&7)==4 presumably selects the leftmost block
          * column - scan8[] is defined outside this view, confirm. */
1156         if(topright_ref == PART_NOT_AVAILABLE
1157            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
1158            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
1159             if(!MB_FIELD
1160                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
1161                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
1162             }
1163             if(MB_FIELD
1164                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
1165                && i >= scan8[0]+8){
1166                 // leftshift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's ok.
1167                 SET_DIAG_MV(>>1, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
1168             }
1169         }
1170 #undef SET_DIAG_MV
1171     }
1172
     /* regular case: top-right neighbour if available, else top-left */
1173     if(topright_ref != PART_NOT_AVAILABLE){
1174         *C= h->mv_cache[list][ i - 8 + part_width ];
1175         return topright_ref;
1176     }else{
1177         tprintf("topright MV not available\n");
1178
1179         *C= h->mv_cache[list][ i - 8 - 1 ];
1180         return h->ref_cache[list][ i - 8 - 1 ];
1181     }
1182 }
1183
1184 /**
1185  * gets the predicted MV.
1186  * @param n the block index
1187  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
1188  * @param mx the x component of the predicted motion vector
1189  * @param my the y component of the predicted motion vector
1190  */
1191 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
1192     const int index8= scan8[n];
1193     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
1194     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
1195     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
1196     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
1197     const int16_t * C;
1198     int diagonal_ref, match_count;
1199
1200     assert(part_width==1 || part_width==2 || part_width==4);
1201
1202 /* mv_cache
1203   B . . A T T T T
1204   U . . L . . , .
1205   U . . L . . . .
1206   U . . L . . , .
1207   . . . L . . . .
1208 */
1209
1210     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
1211     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
1212     tprintf("pred_motion match_count=%d\n", match_count);
1213     if(match_count > 1){ //most common
1214         *mx= mid_pred(A[0], B[0], C[0]);
1215         *my= mid_pred(A[1], B[1], C[1]);
1216     }else if(match_count==1){
1217         if(left_ref==ref){
1218             *mx= A[0];
1219             *my= A[1];
1220         }else if(top_ref==ref){
1221             *mx= B[0];
1222             *my= B[1];
1223         }else{
1224             *mx= C[0];
1225             *my= C[1];
1226         }
1227     }else{
1228         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
1229             *mx= A[0];
1230             *my= A[1];
1231         }else{
1232             *mx= mid_pred(A[0], B[0], C[0]);
1233             *my= mid_pred(A[1], B[1], C[1]);
1234         }
1235     }
1236
1237     tprintf("pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
1238 }
1239
1240 /**
1241  * gets the directionally predicted 16x8 MV.
1242  * @param n the block index
1243  * @param mx the x component of the predicted motion vector
1244  * @param my the y component of the predicted motion vector
1245  */
1246 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
1247     if(n==0){
1248         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
1249         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
1250
1251         tprintf("pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
1252
1253         if(top_ref == ref){
1254             *mx= B[0];
1255             *my= B[1];
1256             return;
1257         }
1258     }else{
1259         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
1260         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
1261
1262         tprintf("pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
1263
1264         if(left_ref == ref){
1265             *mx= A[0];
1266             *my= A[1];
1267             return;
1268         }
1269     }
1270
1271     //RARE
1272     pred_motion(h, n, 4, list, ref, mx, my);
1273 }
1274
1275 /**
1276  * gets the directionally predicted 8x16 MV.
1277  * @param n the block index
1278  * @param mx the x component of the predicted motion vector
1279  * @param my the y component of the predicted motion vector
1280  */
1281 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
1282     if(n==0){
1283         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
1284         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
1285
1286         tprintf("pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
1287
1288         if(left_ref == ref){
1289             *mx= A[0];
1290             *my= A[1];
1291             return;
1292         }
1293     }else{
1294         const int16_t * C;
1295         int diagonal_ref;
1296
1297         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
1298
1299         tprintf("pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
1300
1301         if(diagonal_ref == ref){
1302             *mx= C[0];
1303             *my= C[1];
1304             return;
1305         }
1306     }
1307
1308     //RARE
1309     pred_motion(h, n, 2, list, ref, mx, my);
1310 }
1311
1312 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
1313     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
1314     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
1315
1316     tprintf("pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
1317
1318     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
1319        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
1320        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
1321
1322         *mx = *my = 0;
1323         return;
1324     }
1325
1326     pred_motion(h, 0, 4, 0, 0, mx, my);
1327
1328     return;
1329 }
1330
1331 static inline void direct_dist_scale_factor(H264Context * const h){
1332     const int poc = h->s.current_picture_ptr->poc;
1333     const int poc1 = h->ref_list[1][0].poc;
1334     int i;
1335     for(i=0; i<h->ref_count[0]; i++){
1336         int poc0 = h->ref_list[0][i].poc;
1337         int td = clip(poc1 - poc0, -128, 127);
1338         if(td == 0 /* FIXME || pic0 is a long-term ref */){
1339             h->dist_scale_factor[i] = 256;
1340         }else{
1341             int tb = clip(poc - poc0, -128, 127);
1342             int tx = (16384 + (FFABS(td) >> 1)) / td;
1343             h->dist_scale_factor[i] = clip((tb*tx + 32) >> 6, -1024, 1023);
1344         }
1345     }
1346     if(FRAME_MBAFF){
1347         for(i=0; i<h->ref_count[0]; i++){
1348             h->dist_scale_factor_field[2*i] =
1349             h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
1350         }
1351     }
1352 }
1353 static inline void direct_ref_list_init(H264Context * const h){
1354     MpegEncContext * const s = &h->s;
1355     Picture * const ref1 = &h->ref_list[1][0];
1356     Picture * const cur = s->current_picture_ptr;
1357     int list, i, j;
1358     if(cur->pict_type == I_TYPE)
1359         cur->ref_count[0] = 0;
1360     if(cur->pict_type != B_TYPE)
1361         cur->ref_count[1] = 0;
1362     for(list=0; list<2; list++){
1363         cur->ref_count[list] = h->ref_count[list];
1364         for(j=0; j<h->ref_count[list]; j++)
1365             cur->ref_poc[list][j] = h->ref_list[list][j].poc;
1366     }
1367     if(cur->pict_type != B_TYPE || h->direct_spatial_mv_pred)
1368         return;
1369     for(list=0; list<2; list++){
1370         for(i=0; i<ref1->ref_count[list]; i++){
1371             const int poc = ref1->ref_poc[list][i];
1372             h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
1373             for(j=0; j<h->ref_count[list]; j++)
1374                 if(h->ref_list[list][j].poc == poc){
1375                     h->map_col_to_list0[list][i] = j;
1376                     break;
1377                 }
1378         }
1379     }
1380     if(FRAME_MBAFF){
1381         for(list=0; list<2; list++){
1382             for(i=0; i<ref1->ref_count[list]; i++){
1383                 j = h->map_col_to_list0[list][i];
1384                 h->map_col_to_list0_field[list][2*i] = 2*j;
1385                 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
1386             }
1387         }
1388     }
1389 }
1390
1391 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
1392     MpegEncContext * const s = &h->s;
1393     const int mb_xy =   s->mb_x +   s->mb_y*s->mb_stride;
1394     const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1395     const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1396     const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
1397     const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
1398     const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
1399     const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
1400     const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
1401     const int is_b8x8 = IS_8X8(*mb_type);
1402     unsigned int sub_mb_type;
1403     int i8, i4;
1404
1405 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
1406     if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
1407         /* FIXME save sub mb types from previous frames (or derive from MVs)
1408          * so we know exactly what block size to use */
1409         sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1410         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
1411     }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
1412         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1413         *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1414     }else{
1415         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1416         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
1417     }
1418     if(!is_b8x8)
1419         *mb_type |= MB_TYPE_DIRECT2;
1420     if(MB_FIELD)
1421         *mb_type |= MB_TYPE_INTERLACED;
1422
1423     tprintf("mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);
1424
1425     if(h->direct_spatial_mv_pred){
1426         int ref[2];
1427         int mv[2][2];
1428         int list;
1429
1430         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1431
1432         /* ref = min(neighbors) */
1433         for(list=0; list<2; list++){
1434             int refa = h->ref_cache[list][scan8[0] - 1];
1435             int refb = h->ref_cache[list][scan8[0] - 8];
1436             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1437             if(refc == -2)
1438                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1439             ref[list] = refa;
1440             if(ref[list] < 0 || (refb < ref[list] && refb >= 0))
1441                 ref[list] = refb;
1442             if(ref[list] < 0 || (refc < ref[list] && refc >= 0))
1443                 ref[list] = refc;
1444             if(ref[list] < 0)
1445                 ref[list] = -1;
1446         }
1447
1448         if(ref[0] < 0 && ref[1] < 0){
1449             ref[0] = ref[1] = 0;
1450             mv[0][0] = mv[0][1] =
1451             mv[1][0] = mv[1][1] = 0;
1452         }else{
1453             for(list=0; list<2; list++){
1454                 if(ref[list] >= 0)
1455                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1456                 else
1457                     mv[list][0] = mv[list][1] = 0;
1458             }
1459         }
1460
1461         if(ref[1] < 0){
1462             *mb_type &= ~MB_TYPE_P0L1;
1463             sub_mb_type &= ~MB_TYPE_P0L1;
1464         }else if(ref[0] < 0){
1465             *mb_type &= ~MB_TYPE_P0L0;
1466             sub_mb_type &= ~MB_TYPE_P0L0;
1467         }
1468
1469         if(IS_16X16(*mb_type)){
1470             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1471             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1472             if(!IS_INTRA(mb_type_col)
1473                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1474                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1475                        && (h->x264_build>33 || !h->x264_build)))){
1476                 if(ref[0] > 0)
1477                     fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1478                 else
1479                     fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
1480                 if(ref[1] > 0)
1481                     fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1482                 else
1483                     fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
1484             }else{
1485                 fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1486                 fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1487             }
1488         }else{
1489             for(i8=0; i8<4; i8++){
1490                 const int x8 = i8&1;
1491                 const int y8 = i8>>1;
1492
1493                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1494                     continue;
1495                 h->sub_mb_type[i8] = sub_mb_type;
1496
1497                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1498                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1499                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1500                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1501
1502                 /* col_zero_flag */
1503                 if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
1504                                               || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
1505                                                   && (h->x264_build>33 || !h->x264_build)))){
1506                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
1507                     if(IS_SUB_8X8(sub_mb_type)){
1508                         const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1509                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1510                             if(ref[0] == 0)
1511                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1512                             if(ref[1] == 0)
1513                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1514                         }
1515                     }else
1516                     for(i4=0; i4<4; i4++){
1517                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1518                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1519                             if(ref[0] == 0)
1520                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1521                             if(ref[1] == 0)
1522                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1523                         }
1524                     }
1525                 }
1526             }
1527         }
1528     }else{ /* direct temporal mv pred */
1529         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1530         const int *dist_scale_factor = h->dist_scale_factor;
1531
1532         if(FRAME_MBAFF){
1533             if(IS_INTERLACED(*mb_type)){
1534                 map_col_to_list0[0] = h->map_col_to_list0_field[0];
1535                 map_col_to_list0[1] = h->map_col_to_list0_field[1];
1536                 dist_scale_factor = h->dist_scale_factor_field;
1537             }
1538             if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1539                 /* FIXME assumes direct_8x8_inference == 1 */
1540                 const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1541                 int mb_types_col[2];
1542                 int y_shift;
1543
1544                 *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
1545                          | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
1546                          | (*mb_type & MB_TYPE_INTERLACED);
1547                 sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;
1548
1549                 if(IS_INTERLACED(*mb_type)){
1550                     /* frame to field scaling */
1551                     mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1552                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1553                     if(s->mb_y&1){
1554                         l1ref0 -= 2*h->b8_stride;
1555                         l1ref1 -= 2*h->b8_stride;
1556                         l1mv0 -= 4*h->b_stride;
1557                         l1mv1 -= 4*h->b_stride;
1558                     }
1559                     y_shift = 0;
1560
1561                     if(   (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
1562                        && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
1563                        && !is_b8x8)
1564                         *mb_type |= MB_TYPE_16x8;
1565                     else
1566                         *mb_type |= MB_TYPE_8x8;
1567                 }else{
1568                     /* field to frame scaling */
1569                     /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
1570                      * but in MBAFF, top and bottom POC are equal */
1571                     int dy = (s->mb_y&1) ? 1 : 2;
1572                     mb_types_col[0] =
1573                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1574                     l1ref0 += dy*h->b8_stride;
1575                     l1ref1 += dy*h->b8_stride;
1576                     l1mv0 += 2*dy*h->b_stride;
1577                     l1mv1 += 2*dy*h->b_stride;
1578                     y_shift = 2;
1579
1580                     if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
1581                        && !is_b8x8)
1582                         *mb_type |= MB_TYPE_16x16;
1583                     else
1584                         *mb_type |= MB_TYPE_8x8;
1585                 }
1586
1587                 for(i8=0; i8<4; i8++){
1588                     const int x8 = i8&1;
1589                     const int y8 = i8>>1;
1590                     int ref0, scale;
1591                     const int16_t (*l1mv)[2]= l1mv0;
1592
1593                     if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1594                         continue;
1595                     h->sub_mb_type[i8] = sub_mb_type;
1596
1597                     fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1598                     if(IS_INTRA(mb_types_col[y8])){
1599                         fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1600                         fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1601                         fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1602                         continue;
1603                     }
1604
1605                     ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
1606                     if(ref0 >= 0)
1607                         ref0 = map_col_to_list0[0][ref0*2>>y_shift];
1608                     else{
1609                         ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
1610                         l1mv= l1mv1;
1611                     }
1612                     scale = dist_scale_factor[ref0];
1613                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1614
1615                     {
1616                         const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
1617                         int my_col = (mv_col[1]<<y_shift)/2;
1618                         int mx = (scale * mv_col[0] + 128) >> 8;
1619                         int my = (scale * my_col + 128) >> 8;
1620                         fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1621                         fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1622                     }
1623                 }
1624                 return;
1625             }
1626         }
1627
1628         /* one-to-one mv scaling */
1629
1630         if(IS_16X16(*mb_type)){
1631             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1632             if(IS_INTRA(mb_type_col)){
1633                 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
1634                 fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
1635                 fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
1636             }else{
1637                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
1638                                                 : map_col_to_list0[1][l1ref1[0]];
1639                 const int scale = dist_scale_factor[ref0];
1640                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1641                 int mv_l0[2];
1642                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1643                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1644                 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref0, 1);
1645                 fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv_l0[0],mv_l0[1]), 4);
1646                 fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]), 4);
1647             }
1648         }else{
1649             for(i8=0; i8<4; i8++){
1650                 const int x8 = i8&1;
1651                 const int y8 = i8>>1;
1652                 int ref0, scale;
1653                 const int16_t (*l1mv)[2]= l1mv0;
1654
1655                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1656                     continue;
1657                 h->sub_mb_type[i8] = sub_mb_type;
1658                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1659                 if(IS_INTRA(mb_type_col)){
1660                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1661                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1662                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1663                     continue;
1664                 }
1665
1666                 ref0 = l1ref0[x8 + y8*h->b8_stride];
1667                 if(ref0 >= 0)
1668                     ref0 = map_col_to_list0[0][ref0];
1669                 else{
1670                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
1671                     l1mv= l1mv1;
1672                 }
1673                 scale = dist_scale_factor[ref0];
1674
1675                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1676                 if(IS_SUB_8X8(sub_mb_type)){
1677                     const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1678                     int mx = (scale * mv_col[0] + 128) >> 8;
1679                     int my = (scale * mv_col[1] + 128) >> 8;
1680                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1681                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1682                 }else
1683                 for(i4=0; i4<4; i4++){
1684                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1685                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1686                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1687                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1688                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1689                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1690                 }
1691             }
1692         }
1693     }
1694 }
1695
1696 static inline void write_back_motion(H264Context *h, int mb_type){
1697     MpegEncContext * const s = &h->s;
1698     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1699     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1700     int list;
1701
1702     if(!USES_LIST(mb_type, 0))
1703         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1704
1705     for(list=0; list<2; list++){
1706         int y;
1707         if(!USES_LIST(mb_type, list))
1708             continue;
1709
1710         for(y=0; y<4; y++){
1711             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1712             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1713         }
1714         if( h->pps.cabac ) {
1715             if(IS_SKIP(mb_type))
1716                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1717             else
1718             for(y=0; y<4; y++){
1719                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1720                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1721             }
1722         }
1723
1724         {
1725             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1726             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1727             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1728             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1729             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1730         }
1731     }
1732
1733     if(h->slice_type == B_TYPE && h->pps.cabac){
1734         if(IS_8X8(mb_type)){
1735             uint8_t *direct_table = &h->direct_table[b8_xy];
1736             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1737             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1738             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1739         }
1740     }
1741 }
1742
1743 /**
1744  * Decodes a network abstraction layer unit.
1745  * @param consumed is the number of bytes used as input
1746  * @param length is the length of the array
1747  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1748  * @returns decoded bytes, might be src+1 if no escapes
1749  */
1750 static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *consumed, int length){
1751     int i, si, di;
1752     uint8_t *dst;
1753
1754 //    src[0]&0x80;                //forbidden bit
1755     h->nal_ref_idc= src[0]>>5;
1756     h->nal_unit_type= src[0]&0x1F;
1757
1758     src++; length--;
1759 #if 0
1760     for(i=0; i<length; i++)
1761         printf("%2X ", src[i]);
1762 #endif
1763     for(i=0; i+1<length; i+=2){
1764         if(src[i]) continue;
1765         if(i>0 && src[i-1]==0) i--;
1766         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1767             if(src[i+2]!=3){
1768                 /* startcode, so we must be past the end */
1769                 length=i;
1770             }
1771             break;
1772         }
1773     }
1774
1775     if(i>=length-1){ //no escaped 0
1776         *dst_length= length;
1777         *consumed= length+1; //+1 for the header
1778         return src;
1779     }
1780
1781     h->rbsp_buffer= av_fast_realloc(h->rbsp_buffer, &h->rbsp_buffer_size, length);
1782     dst= h->rbsp_buffer;
1783
1784     if (dst == NULL){
1785         return NULL;
1786     }
1787
1788 //printf("decoding esc\n");
1789     si=di=0;
1790     while(si<length){
1791         //remove escapes (very rare 1:2^22)
1792         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1793             if(src[si+2]==3){ //escape
1794                 dst[di++]= 0;
1795                 dst[di++]= 0;
1796                 si+=3;
1797                 continue;
1798             }else //next start code
1799                 break;
1800         }
1801
1802         dst[di++]= src[si++];
1803     }
1804
1805     *dst_length= di;
1806     *consumed= si + 1;//+1 for the header
1807 //FIXME store exact number of bits in the getbitcontext (its needed for decoding)
1808     return dst;
1809 }
1810
/**
 * identifies the exact end of the bitstream
 *
 * Looks for the lowest set bit of the byte at src (the rbsp stop bit).
 *
 * @param src pointer to the last byte of the RBSP
 * @return the length of the trailing (1-based position of the lowest set
 *         bit, 1..8), or 0 if damaged (the byte is zero)
 */
static int decode_rbsp_trailing(uint8_t *src){
    int bits= *src;
    int pos= 1;

    tprintf("rbsp trailing %X\n", bits);

    while(pos <= 8){
        if(bits & 1)
            return pos;
        bits >>= 1;
        pos++;
    }
    return 0;
}
1827
1828 /**
1829  * idct tranforms the 16 dc values and dequantize them.
1830  * @param qp quantization parameter
1831  */
1832 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1833 #define stride 16
1834     int i;
1835     int temp[16]; //FIXME check if this is a good idea
1836     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1837     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1838
1839 //memset(block, 64, 2*256);
1840 //return;
1841     for(i=0; i<4; i++){
1842         const int offset= y_offset[i];
1843         const int z0= block[offset+stride*0] + block[offset+stride*4];
1844         const int z1= block[offset+stride*0] - block[offset+stride*4];
1845         const int z2= block[offset+stride*1] - block[offset+stride*5];
1846         const int z3= block[offset+stride*1] + block[offset+stride*5];
1847
1848         temp[4*i+0]= z0+z3;
1849         temp[4*i+1]= z1+z2;
1850         temp[4*i+2]= z1-z2;
1851         temp[4*i+3]= z0-z3;
1852     }
1853
1854     for(i=0; i<4; i++){
1855         const int offset= x_offset[i];
1856         const int z0= temp[4*0+i] + temp[4*2+i];
1857         const int z1= temp[4*0+i] - temp[4*2+i];
1858         const int z2= temp[4*1+i] - temp[4*3+i];
1859         const int z3= temp[4*1+i] + temp[4*3+i];
1860
1861         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_resdual
1862         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1863         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1864         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1865     }
1866 }
1867
#if 0
/**
 * dct transforms the 16 dc values.
 * Disabled code, kept for reference; relies on the `stride` macro defined by
 * the preceding function.
 * @param qp quantization parameter ??? FIXME
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
    int temp[16]; //FIXME check if this is a good idea
    int i;

    /* first pass: butterfly over the four values of each group into temp */
    for(i=0; i<4; i++){
        const DCTELEM * const in= block + y_offset[i];
        int * const out= temp + 4*i;
        const int sum04= in[stride*0] + in[stride*4];
        const int dif04= in[stride*0] - in[stride*4];
        const int dif15= in[stride*1] - in[stride*5];
        const int sum15= in[stride*1] + in[stride*5];

        out[0]= sum04 + sum15;
        out[1]= dif04 + dif15;
        out[2]= dif04 - dif15;
        out[3]= sum04 - sum15;
    }

    /* second pass: butterfly across temp, halving the results */
    for(i=0; i<4; i++){
        DCTELEM * const out= block + x_offset[i];
        const int sum02= temp[4*0+i] + temp[4*2+i];
        const int dif02= temp[4*0+i] - temp[4*2+i];
        const int dif13= temp[4*1+i] - temp[4*3+i];
        const int sum13= temp[4*1+i] + temp[4*3+i];

        out[stride*0 ]= (sum02 + sum13)>>1;
        out[stride*2 ]= (dif02 + dif13)>>1;
        out[stride*8 ]= (dif02 - dif13)>>1;
        out[stride*10]= (sum02 - sum13)>>1;
    }
}
#endif

#undef xStride
#undef stride
1910
1911 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1912     const int stride= 16*2;
1913     const int xStride= 16;
1914     int a,b,c,d,e;
1915
1916     a= block[stride*0 + xStride*0];
1917     b= block[stride*0 + xStride*1];
1918     c= block[stride*1 + xStride*0];
1919     d= block[stride*1 + xStride*1];
1920
1921     e= a-b;
1922     a= a+b;
1923     b= c-d;
1924     c= c+d;
1925
1926     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1927     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1928     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1929     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1930 }
1931
#if 0
/**
 * Forward 2x2 transform of the chroma DC values, in place.
 * Disabled code, kept for reference.
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int row_step= 16*2;
    const int col_step= 16;
    DCTELEM * const top= block;
    DCTELEM * const bot= block + row_step;
    /* horizontal butterflies on both rows, read before anything is written */
    const int top_sum= top[0] + top[col_step];
    const int top_dif= top[0] - top[col_step];
    const int bot_sum= bot[0] + bot[col_step];
    const int bot_dif= bot[0] - bot[col_step];

    /* vertical butterflies */
    top[0]       = (top_sum + bot_sum);
    top[col_step]= (top_dif + bot_dif);
    bot[0]       = (top_sum - bot_sum);
    bot[col_step]= (top_dif - bot_dif);
}
#endif
1954
1955 /**
1956  * gets the chroma qp.
1957  */
1958 static inline int get_chroma_qp(int chroma_qp_index_offset, int qscale){
1959
1960     return chroma_qp[clip(qscale + chroma_qp_index_offset, 0, 51)];
1961 }
1962
1963 //FIXME need to check that this doesnt overflow signed 32 bit for low qp, i am not sure, it's very close
1964 //FIXME check that gcc inlines this (and optimizes intra & seperate_dc stuff away)
1965 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int seperate_dc){
1966     int i;
1967     const int * const quant_table= quant_coeff[qscale];
1968     const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1969     const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1970     const unsigned int threshold2= (threshold1<<1);
1971     int last_non_zero;
1972
1973     if(seperate_dc){
1974         if(qscale<=18){
1975             //avoid overflows
1976             const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1977             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1978             const unsigned int dc_threshold2= (dc_threshold1<<1);
1979
1980             int level= block[0]*quant_coeff[qscale+18][0];
1981             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1982                 if(level>0){
1983                     level= (dc_bias + level)>>(QUANT_SHIFT-2);
1984                     block[0]= level;
1985                 }else{
1986                     level= (dc_bias - level)>>(QUANT_SHIFT-2);
1987                     block[0]= -level;
1988                 }
1989 //                last_non_zero = i;
1990             }else{
1991                 block[0]=0;
1992             }
1993         }else{
1994             const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1995             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1996             const unsigned int dc_threshold2= (dc_threshold1<<1);
1997
1998             int level= block[0]*quant_table[0];
1999             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
2000                 if(level>0){
2001                     level= (dc_bias + level)>>(QUANT_SHIFT+1);
2002                     block[0]= level;
2003                 }else{
2004                     level= (dc_bias - level)>>(QUANT_SHIFT+1);
2005                     block[0]= -level;
2006                 }
2007 //                last_non_zero = i;
2008             }else{
2009                 block[0]=0;
2010             }
2011         }
2012         last_non_zero= 0;
2013         i=1;
2014     }else{
2015         last_non_zero= -1;
2016         i=0;
2017     }
2018
2019     for(; i<16; i++){
2020         const int j= scantable[i];
2021         int level= block[j]*quant_table[j];
2022
2023 //        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
2024 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
2025         if(((unsigned)(level+threshold1))>threshold2){
2026             if(level>0){
2027                 level= (bias + level)>>QUANT_SHIFT;
2028                 block[j]= level;
2029             }else{
2030                 level= (bias - level)>>QUANT_SHIFT;
2031                 block[j]= -level;
2032             }
2033             last_non_zero = i;
2034         }else{
2035             block[j]=0;
2036         }
2037     }
2038
2039     return last_non_zero;
2040 }
2041
/**
 * 4x4 vertical intra prediction: every row becomes a copy of the four
 * pixels directly above the block.
 *
 * @param src      top-left pixel of the block; rows are accessed as 32-bit
 *                 words, so src and stride must suit such accesses
 * @param topright unused
 * @param stride   distance between two rows in bytes
 */
static void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){
    const uint32_t top_row= *(uint32_t*)(src - stride);
    int y;

    for(y=0; y<4; y++)
        *(uint32_t*)(src + y*stride)= top_row;
}
2049
/**
 * 4x4 horizontal intra prediction: every row is filled with the pixel to
 * the left of that row.
 *
 * @param src      top-left pixel of the block; rows are written as 32-bit
 *                 words, so src and stride must suit such accesses
 * @param topright unused
 * @param stride   distance between two rows in bytes
 */
static void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){
    int y;

    for(y=0; y<4; y++){
        /* The multiplier must be unsigned: with a signed constant a left
         * pixel >= 128 overflows int, which is undefined behaviour. */
        *(uint32_t*)(src + y*stride)= src[-1 + y*stride]*0x01010101U;
    }
}
2056
/**
 * 4x4 DC intra prediction: the block is filled with the rounded mean of the
 * four pixels above and the four pixels to the left.
 *
 * @param src      top-left pixel of the block; rows are written as 32-bit
 *                 words, so src and stride must suit such accesses
 * @param topright unused
 * @param stride   distance between two rows in bytes
 */
static void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
                   + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
    /* unsigned multiply: dc >= 128 would overflow a signed int product */
    const uint32_t fill= dc*0x01010101U;
    int y;

    for(y=0; y<4; y++)
        *(uint32_t*)(src + y*stride)= fill;
}
2066
/**
 * 4x4 DC intra prediction from the left edge only: the block is filled with
 * the rounded mean of the four pixels to its left.
 *
 * @param src      top-left pixel of the block; rows are written as 32-bit
 *                 words, so src and stride must suit such accesses
 * @param topright unused
 * @param stride   distance between two rows in bytes
 */
static void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
    /* unsigned multiply: dc >= 128 would overflow a signed int product */
    const uint32_t fill= dc*0x01010101U;
    int y;

    for(y=0; y<4; y++)
        *(uint32_t*)(src + y*stride)= fill;
}
2075
/**
 * 4x4 DC intra prediction from the top edge only: the block is filled with
 * the rounded mean of the four pixels above it.
 *
 * @param src      top-left pixel of the block; rows are written as 32-bit
 *                 words, so src and stride must suit such accesses
 * @param topright unused
 * @param stride   distance between two rows in bytes
 */
static void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
    /* unsigned multiply: dc >= 128 would overflow a signed int product */
    const uint32_t fill= dc*0x01010101U;
    int y;

    for(y=0; y<4; y++)
        *(uint32_t*)(src + y*stride)= fill;
}
2084
/**
 * 4x4 DC intra prediction without any neighbours: the block is filled with
 * the constant 128.
 *
 * @param src      top-left pixel of the block; rows are written as 32-bit
 *                 words, so src and stride must suit such accesses
 * @param topright unused
 * @param stride   distance between two rows in bytes
 */
static void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const uint32_t fill= 128U*0x01010101U;
    int y;

    for(y=0; y<4; y++)
        *(uint32_t*)(src + y*stride)= fill;
}
2091
2092
/* Declares t4..t7 as the four pixels topright[0..3].
 * Needs `topright` in scope. */
#define LOAD_TOP_RIGHT_EDGE \
    const int t4= topright[0], t5= topright[1], \
              t6= topright[2], t7= topright[3];

/* Declares l0..l3 as the four pixels in the column left of the block.
 * Needs `src` and `stride` in scope. */
#define LOAD_LEFT_EDGE \
    const int l0= src[-1+0*stride], l1= src[-1+1*stride], \
              l2= src[-1+2*stride], l3= src[-1+3*stride];

/* Declares t0..t3 as the four pixels in the row above the block.
 * Needs `src` and `stride` in scope. */
#define LOAD_TOP_EDGE \
    const int t0= src[ 0-1*stride], t1= src[ 1-1*stride], \
              t2= src[ 2-1*stride], t3= src[ 3-1*stride];

2111 static void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){
2112     const int lt= src[-1-1*stride];
2113     LOAD_TOP_EDGE
2114     LOAD_LEFT_EDGE
2115
2116     src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2;
2117     src[0+2*stride]=
2118     src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
2119     src[0+1*stride]=
2120     src[1+2*stride]=
2121     src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
2122     src[0+0*stride]=
2123     src[1+1*stride]=
2124     src[2+2*stride]=
2125     src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
2126     src[1+0*stride]=
2127     src[2+1*stride]=
2128     src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
2129     src[2+0*stride]=
2130     src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
2131     src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
2132 }
2133
2134 static void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){
2135     LOAD_TOP_EDGE
2136     LOAD_TOP_RIGHT_EDGE
2137 //    LOAD_LEFT_EDGE
2138
2139     src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
2140     src[1+0*stride]=
2141     src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
2142     src[2+0*stride]=
2143     src[1+1*stride]=
2144     src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
2145     src[3+0*stride]=
2146     src[2+1*stride]=
2147     src[1+2*stride]=
2148     src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
2149     src[3+1*stride]=
2150     src[2+2*stride]=
2151     src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
2152     src[3+2*stride]=
2153     src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
2154     src[3+3*stride]=(t6 + 3*t7 + 2)>>2;
2155 }
2156
2157 static void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){
2158     const int lt= src[-1-1*stride];
2159     LOAD_TOP_EDGE
2160     LOAD_LEFT_EDGE
2161     const __attribute__((unused)) int unu= l3;
2162
2163     src[0+0*stride]=
2164     src[1+2*stride]=(lt + t0 + 1)>>1;
2165     src[1+0*stride]=
2166     src[2+2*stride]=(t0 + t1 + 1)>>1;
2167     src[2+0*stride]=
2168     src[3+2*stride]=(t1 + t2 + 1)>>1;
2169     src[3+0*stride]=(t2 + t3 + 1)>>1;
2170     src[0+1*stride]=
2171     src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
2172     src[1+1*stride]=
2173     src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
2174     src[2+1*stride]=
2175     src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
2176     src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
2177     src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
2178     src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
2179 }
2180
2181 static void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){
2182     LOAD_TOP_EDGE
2183     LOAD_TOP_RIGHT_EDGE
2184     const __attribute__((unused)) int unu= t7;
2185
2186     src[0+0*stride]=(t0 + t1 + 1)>>1;
2187     src[1+0*stride]=
2188     src[0+2*stride]=(t1 + t2 + 1)>>1;
2189     src[2+0*stride]=
2190     src[1+2*stride]=(t2 + t3 + 1)>>1;
2191     src[3+0*stride]=
2192     src[2+2*stride]=(t3 + t4+ 1)>>1;
2193     src[3+2*stride]=(t4 + t5+ 1)>>1;
2194     src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
2195     src[1+1*stride]=
2196     src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
2197     src[2+1*stride]=
2198     src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
2199     src[3+1*stride]=
2200     src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
2201     src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
2202 }
2203
2204 static void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){
2205     LOAD_LEFT_EDGE
2206
2207     src[0+0*stride]=(l0 + l1 + 1)>>1;
2208     src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
2209     src[2+0*stride]=
2210     src[0+1*stride]=(l1 + l2 + 1)>>1;
2211     src[3+0*stride]=
2212     src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
2213     src[2+1*stride]=
2214     src[0+2*stride]=(l2 + l3 + 1)>>1;
2215     src[3+1*stride]=
2216     src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2;
2217     src[3+2*stride]=
2218     src[1+3*stride]=
2219     src[0+3*stride]=
2220     src[2+2*stride]=
2221     src[2+3*stride]=
2222     src[3+3*stride]=l3;
2223 }
2224
2225 static void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){
2226     const int lt= src[-1-1*stride];
2227     LOAD_TOP_EDGE
2228     LOAD_LEFT_EDGE
2229     const __attribute__((unused)) int unu= t3;
2230
2231     src[0+0*stride]=
2232     src[2+1*stride]=(lt + l0 + 1)>>1;
2233     src[1+0*stride]=
2234     src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
2235     src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
2236     src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
2237     src[0+1*stride]=
2238     src[2+2*stride]=(l0 + l1 + 1)>>1;
2239     src[1+1*stride]=
2240     src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
2241     src[0+2*stride]=
2242     src[2+3*stride]=(l1 + l2+ 1)>>1;
2243     src[1+2*stride]=
2244     src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
2245     src[0+3*stride]=(l2 + l3 + 1)>>1;
2246     src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
2247 }
2248
/**
 * 16x16 vertical intra prediction: replicates the 16 samples of the row
 * directly above the block into each of its 16 rows.
 * Rows are accessed as four 32-bit words, so src and stride must keep
 * every row suitably aligned for uint32_t access.
 */
void ff_pred16x16_vertical_c(uint8_t *src, int stride){
    const uint32_t * const above= (const uint32_t*)(src - stride);
    const uint32_t w0= above[0];
    const uint32_t w1= above[1];
    const uint32_t w2= above[2];
    const uint32_t w3= above[3];
    int rows_left= 16;

    while(rows_left--){
        uint32_t * const dst= (uint32_t*)src;
        dst[0]= w0;
        dst[1]= w1;
        dst[2]= w2;
        dst[3]= w3;
        src += stride;
    }
}
2263
/**
 * 16x16 horizontal intra prediction: each row is filled with the sample
 * immediately to its left (src[-1] of that row).
 * Rows are written as four 32-bit words, so src and stride must keep
 * every row suitably aligned for uint32_t access.
 */
void ff_pred16x16_horizontal_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<16; i++){
        uint32_t * const row= (uint32_t*)(src+i*stride);
        /* unsigned constant: a signed multiply overflows for samples >= 128 */
        const uint32_t splat= src[-1+i*stride]*0x01010101U;

        row[0]=
        row[1]=
        row[2]=
        row[3]= splat;
    }
}
2274
/**
 * 16x16 DC intra prediction: fills the block with the rounded mean of the
 * 16 left-neighbour and 16 top-neighbour samples.
 * Rows are written as four 32-bit words, so src and stride must keep
 * every row suitably aligned for uint32_t access.
 */
void ff_pred16x16_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t dc;

    for(i=0; i<16; i++)
        sum+= src[-1+i*stride] + src[i-stride];

    /* 32 samples -> round and divide by 32; the byte splat is done in
     * unsigned arithmetic because a mean >= 128 overflows a signed int */
    dc= 0x01010101U*((sum + 16)>>5);

    for(i=0; i<16; i++){
        uint32_t * const row= (uint32_t*)(src+i*stride);
        row[0]=
        row[1]=
        row[2]=
        row[3]= dc;
    }
}
2295
/**
 * 16x16 DC intra prediction using only the left neighbours: fills the
 * block with the rounded mean of the 16 samples in the column left of it.
 */
static void pred16x16_left_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t dc;

    for(i=0; i<16; i++)
        sum+= src[-1+i*stride];

    /* unsigned splat: a mean >= 128 would overflow a signed multiply */
    dc= 0x01010101U*((sum + 8)>>4);

    for(i=0; i<16; i++){
        uint32_t * const row= (uint32_t*)(src+i*stride);
        row[0]=
        row[1]=
        row[2]=
        row[3]= dc;
    }
}
2312
/**
 * 16x16 DC intra prediction using only the top neighbours: fills the
 * block with the rounded mean of the 16 samples in the row above it.
 */
static void pred16x16_top_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t dc;

    for(i=0; i<16; i++)
        sum+= src[i-stride];

    /* unsigned splat: a mean >= 128 would overflow a signed multiply */
    dc= 0x01010101U*((sum + 8)>>4);

    for(i=0; i<16; i++){
        uint32_t * const row= (uint32_t*)(src+i*stride);
        row[0]=
        row[1]=
        row[2]=
        row[3]= dc;
    }
}
2328
/**
 * 16x16 DC intra prediction with no neighbours: fills the block with the
 * constant 128.
 */
void ff_pred16x16_128_dc_c(uint8_t *src, int stride){
    const uint32_t grey= 0x01010101U*128U;
    int row;

    for(row=0; row<16; row++){
        uint32_t * const dst= (uint32_t*)(src + row*stride);
        dst[0]= grey;
        dst[1]= grey;
        dst[2]= grey;
        dst[3]= grey;
    }
}
2339
2340 static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3){
2341   int i, j, k;
2342   int a;
2343   uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2344   const uint8_t * const src0 = src+7-stride;
2345   const uint8_t *src1 = src+8*stride-1;
2346   const uint8_t *src2 = src1-2*stride;      // == src+6*stride-1;
2347   int H = src0[1] - src0[-1];
2348   int V = src1[0] - src2[ 0];
2349   for(k=2; k<=8; ++k) {
2350     src1 += stride; src2 -= stride;
2351     H += k*(src0[k] - src0[-k]);
2352     V += k*(src1[0] - src2[ 0]);
2353   }
2354   if(svq3){
2355     H = ( 5*(H/4) ) / 16;
2356     V = ( 5*(V/4) ) / 16;
2357
2358     /* required for 100% accuracy */
2359     i = H; H = V; V = i;
2360   }else{
2361     H = ( 5*H+32 ) >> 6;
2362     V = ( 5*V+32 ) >> 6;
2363   }
2364
2365   a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
2366   for(j=16; j>0; --j) {
2367     int b = a;
2368     a += V;
2369     for(i=-16; i<0; i+=4) {
2370       src[16+i] = cm[ (b    ) >> 5 ];
2371       src[17+i] = cm[ (b+  H) >> 5 ];
2372       src[18+i] = cm[ (b+2*H) >> 5 ];
2373       src[19+i] = cm[ (b+3*H) >> 5 ];
2374       b += 4*H;
2375     }
2376     src += stride;
2377   }
2378 }
2379
/**
 * 16x16 plane intra prediction, H.264 flavour: forwards to
 * pred16x16_plane_compat_c() with the svq3 variant switched off.
 */
void ff_pred16x16_plane_c(uint8_t *src, int stride){
    const int svq3_variant= 0;

    pred16x16_plane_compat_c(src, stride, svq3_variant);
}
2383
/**
 * 8x8 vertical intra prediction: replicates the 8 samples of the row
 * directly above the block into each of its 8 rows (two 32-bit words
 * per row).
 */
void ff_pred8x8_vertical_c(uint8_t *src, int stride){
    const uint32_t * const above= (const uint32_t*)(src - stride);
    const uint32_t w0= above[0];
    const uint32_t w1= above[1];
    int rows_left= 8;

    while(rows_left--){
        uint32_t * const dst= (uint32_t*)src;
        dst[0]= w0;
        dst[1]= w1;
        src += stride;
    }
}
2394
/**
 * 8x8 horizontal intra prediction: each row is filled with the sample
 * immediately to its left (src[-1] of that row).
 */
void ff_pred8x8_horizontal_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<8; i++){
        uint32_t * const row= (uint32_t*)(src+i*stride);
        /* unsigned constant: a signed multiply overflows for samples >= 128 */
        const uint32_t splat= src[-1+i*stride]*0x01010101U;

        row[0]=
        row[1]= splat;
    }
}
2403
/**
 * 8x8 DC intra prediction with no neighbours: fills the block with the
 * constant 128.
 */
void ff_pred8x8_128_dc_c(uint8_t *src, int stride){
    const uint32_t grey= 0x01010101U*128U;
    int row;

    for(row=0; row<8; row++){
        uint32_t * const dst= (uint32_t*)(src + row*stride);
        dst[0]= grey;
        dst[1]= grey;
    }
}
2412
/**
 * 8x8 DC intra prediction using only the left neighbours.
 * The upper four rows are filled with the rounded mean of left samples
 * 0..3, the lower four rows with the rounded mean of left samples 4..7.
 */
static void pred8x8_left_dc_c(uint8_t *src, int stride){
    int i;
    int sum_upper=0, sum_lower=0;
    uint32_t dc_upper, dc_lower;

    for(i=0; i<4; i++){
        sum_upper+= src[-1+i*stride];
        sum_lower+= src[-1+(i+4)*stride];
    }
    /* unsigned splat: a mean >= 128 would overflow a signed multiply */
    dc_upper= 0x01010101U*((sum_upper + 2)>>2);
    dc_lower= 0x01010101U*((sum_lower + 2)>>2);

    for(i=0; i<8; i++){
        uint32_t * const row= (uint32_t*)(src+i*stride);
        row[0]=
        row[1]= i<4 ? dc_upper : dc_lower;
    }
}
2434
/**
 * 8x8 DC intra prediction using only the top neighbours.
 * The left four columns are filled with the rounded mean of top samples
 * 0..3, the right four columns with the rounded mean of top samples 4..7;
 * all eight rows are identical.
 */
static void pred8x8_top_dc_c(uint8_t *src, int stride){
    int i;
    int sum_left=0, sum_right=0;
    uint32_t dc_left, dc_right;

    for(i=0; i<4; i++){
        sum_left += src[i-stride];
        sum_right+= src[4+i-stride];
    }
    /* unsigned splat: a mean >= 128 would overflow a signed multiply */
    dc_left = 0x01010101U*((sum_left  + 2)>>2);
    dc_right= 0x01010101U*((sum_right + 2)>>2);

    for(i=0; i<8; i++){
        uint32_t * const row= (uint32_t*)(src+i*stride);
        row[0]= dc_left;
        row[1]= dc_right;
    }
}
2456
2457
/**
 * 8x8 DC intra prediction, computed separately for each 4x4 quadrant:
 *   top-left     : mean of left samples 0..3 and top samples 0..3,
 *   top-right    : mean of top samples 4..7,
 *   bottom-left  : mean of left samples 4..7,
 *   bottom-right : mean of top samples 4..7 and left samples 4..7.
 */
void ff_pred8x8_dc_c(uint8_t *src, int stride){
    int i;
    int sum_tl=0, sum_top_r=0, sum_left_b=0;
    uint32_t dc_tl, dc_tr, dc_bl, dc_br;

    for(i=0; i<4; i++){
        sum_tl    += src[-1+i*stride] + src[i-stride];
        sum_top_r += src[4+i-stride];
        sum_left_b+= src[-1+(i+4)*stride];
    }
    /* unsigned splats: a mean >= 128 would overflow a signed multiply */
    dc_tl= 0x01010101U*((sum_tl + 4)>>3);
    dc_tr= 0x01010101U*((sum_top_r + 2)>>2);
    dc_bl= 0x01010101U*((sum_left_b + 2)>>2);
    dc_br= 0x01010101U*((sum_top_r + sum_left_b + 4)>>3);

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]= dc_tl;
        ((uint32_t*)(src+i*stride))[1]= dc_tr;
    }
    for(i=4; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= dc_bl;
        ((uint32_t*)(src+i*stride))[1]= dc_br;
    }
}
2482
2483 void ff_pred8x8_plane_c(uint8_t *src, int stride){
2484   int j, k;
2485   int a;
2486   uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2487   const uint8_t * const src0 = src+3-stride;
2488   const uint8_t *src1 = src+4*stride-1;
2489   const uint8_t *src2 = src1-2*stride;      // == src+2*stride-1;
2490   int H = src0[1] - src0[-1];
2491   int V = src1[0] - src2[ 0];
2492   for(k=2; k<=4; ++k) {
2493     src1 += stride; src2 -= stride;
2494     H += k*(src0[k] - src0[-k]);
2495     V += k*(src1[0] - src2[ 0]);
2496   }
2497   H = ( 17*H+16 ) >> 5;
2498   V = ( 17*V+16 ) >> 5;
2499
2500   a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
2501   for(j=8; j>0; --j) {
2502     int b = a;
2503     a += V;
2504     src[0] = cm[ (b    ) >> 5 ];
2505     src[1] = cm[ (b+  H) >> 5 ];
2506     src[2] = cm[ (b+2*H) >> 5 ];
2507     src[3] = cm[ (b+3*H) >> 5 ];
2508     src[4] = cm[ (b+4*H) >> 5 ];
2509     src[5] = cm[ (b+5*H) >> 5 ];
2510     src[6] = cm[ (b+6*H) >> 5 ];
2511     src[7] = cm[ (b+7*H) >> 5 ];
2512     src += stride;
2513   }
2514 }
2515
/*
 * Helper macros for the pred8x8l_* predictors below.  They expect the
 * enclosing function to provide `src`, `stride`, `has_topleft` and
 * `has_topright`.  All neighbour samples are low-pass filtered (1-2-1)
 * before use.
 */

/* Sample at column x, row y relative to the top-left pixel of the block. */
#define SRC(x,y) src[(x)+(y)*stride]

/* Declares l<row>: filtered left neighbour of an interior row. */
#define PL(row) \
    const int l##row = (SRC(-1,row-1) + 2*SRC(-1,row) + SRC(-1,row+1) + 2) >> 2;

/* Declares l0..l7.  l0 falls back to SRC(-1,0) when the top-left sample
 * is unavailable; l7 has no neighbour below and weights SRC(-1,7) 3x. */
#define PREDICT_8x8_LOAD_LEFT \
    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
    PL(1) PL(2) PL(3) \
    PL(4) PL(5) PL(6) \
    const int l7 attribute_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2

/* Declares t<col>: filtered top neighbour of an interior column. */
#define PT(col) \
    const int t##col = (SRC(col-1,-1) + 2*SRC(col,-1) + SRC(col+1,-1) + 2) >> 2;

/* Declares t0..t7.  t0 falls back to SRC(0,-1) without a top-left sample,
 * t7 falls back to SRC(7,-1) without a top-right block. */
#define PREDICT_8x8_LOAD_TOP \
    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
    PT(1) PT(2) PT(3) \
    PT(4) PT(5) PT(6) \
    const int t7 attribute_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2

/* Assigns (does not declare) t<col> for a top-right column. */
#define PTR(col) \
    t##col = (SRC(col-1,-1) + 2*SRC(col,-1) + SRC(col+1,-1) + 2) >> 2;

/* Declares t8..t15: filtered top-right samples when available, otherwise
 * all of them replicate the unfiltered SRC(7,-1). */
#define PREDICT_8x8_LOAD_TOPRIGHT \
    int t8, t9, t10, t11, t12, t13, t14, t15; \
    if(has_topright) { \
        PTR(8)  PTR(9)  PTR(10) PTR(11) \
        PTR(12) PTR(13) PTR(14) \
        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);

/* Declares lt: filtered top-left corner sample. */
#define PREDICT_8x8_LOAD_TOPLEFT \
    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2

/* Fills all 8 rows with the 32-bit word `word` (4 identical bytes twice).
 * Declares `y` and advances `src`. */
#define PREDICT_8x8_DC(word) \
    int y; \
    for( y = 0; y < 8; y++ ) { \
        ((uint32_t*)src)[0] = \
        ((uint32_t*)src)[1] = word; \
        src += stride; \
    }
2553
/**
 * 8x8 luma DC prediction with no neighbours available: every sample of
 * the block becomes 128 (0x80).
 */
static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    const uint32_t grey = 0x80808080;
    PREDICT_8x8_DC(grey);
}
2558 static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2559 {
2560     PREDICT_8x8_LOAD_LEFT;
2561     const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101;
2562     PREDICT_8x8_DC(dc);
2563 }
2564 static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2565 {
2566     PREDICT_8x8_LOAD_TOP;
2567     const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101;
2568     PREDICT_8x8_DC(dc);
2569 }
2570 static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2571 {
2572     PREDICT_8x8_LOAD_LEFT;
2573     PREDICT_8x8_LOAD_TOP;
2574     const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7
2575                          +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101;
2576     PREDICT_8x8_DC(dc);
2577 }
2578 static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2579 {
2580     PREDICT_8x8_LOAD_LEFT;
2581 #define ROW(y) ((uint32_t*)(src+y*stride))[0] =\
2582                ((uint32_t*)(src+y*stride))[1] = 0x01010101 * l##y
2583     ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
2584 #undef ROW
2585 }
2586 static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2587 {
2588     int y;
2589     PREDICT_8x8_LOAD_TOP;
2590     src[0] = t0;
2591     src[1] = t1;
2592     src[2] = t2;
2593     src[3] = t3;
2594     src[4] = t4;
2595     src[5] = t5;
2596     src[6] = t6;
2597     src[7] = t7;
2598     for( y = 1; y < 8; y++ )
2599         *(uint64_t*)(src+y*stride) = *(uint64_t*)src;
2600 }
2601 static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2602 {
2603     PREDICT_8x8_LOAD_TOP;
2604     PREDICT_8x8_LOAD_TOPRIGHT;
2605     SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
2606     SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
2607     SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
2608     SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
2609     SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
2610     SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
2611     SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
2612     SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
2613     SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
2614     SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
2615     SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
2616     SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
2617     SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
2618     SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
2619     SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
2620 }
2621 static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2622 {
2623     PREDICT_8x8_LOAD_TOP;
2624     PREDICT_8x8_LOAD_LEFT;
2625     PREDICT_8x8_LOAD_TOPLEFT;
2626     SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
2627     SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
2628     SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
2629     SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
2630     SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
2631     SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
2632     SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
2633     SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
2634     SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
2635     SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
2636     SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
2637     SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
2638     SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
2639     SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
2640     SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
2641
2642 }
2643 static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2644 {
2645     PREDICT_8x8_LOAD_TOP;
2646     PREDICT_8x8_LOAD_LEFT;
2647     PREDICT_8x8_LOAD_TOPLEFT;
2648     SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
2649     SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
2650     SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
2651     SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
2652     SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
2653     SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
2654     SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
2655     SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
2656     SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
2657     SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
2658     SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
2659     SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
2660     SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
2661     SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
2662     SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
2663     SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
2664     SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
2665     SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
2666     SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
2667     SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
2668     SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
2669     SRC(7,0)= (t6 + t7 + 1) >> 1;
2670 }
2671 static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2672 {
2673     PREDICT_8x8_LOAD_TOP;
2674     PREDICT_8x8_LOAD_LEFT;
2675     PREDICT_8x8_LOAD_TOPLEFT;
2676     SRC(0,7)= (l6 + l7 + 1) >> 1;
2677     SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
2678     SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
2679     SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
2680     SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
2681     SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
2682     SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
2683     SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
2684     SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
2685     SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
2686     SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
2687     SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
2688     SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
2689     SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
2690     SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
2691     SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
2692     SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
2693     SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
2694     SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
2695     SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
2696     SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
2697     SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
2698 }
2699 static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2700 {
2701     PREDICT_8x8_LOAD_TOP;
2702     PREDICT_8x8_LOAD_TOPRIGHT;
2703     SRC(0,0)= (t0 + t1 + 1) >> 1;
2704     SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
2705     SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
2706     SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
2707     SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
2708     SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
2709     SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
2710     SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
2711     SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
2712     SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
2713     SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
2714     SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
2715     SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
2716     SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
2717     SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
2718     SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
2719     SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
2720     SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
2721     SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
2722     SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
2723     SRC(7,6)= (t10 + t11 + 1) >> 1;
2724     SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
2725 }
2726 static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2727 {
2728     PREDICT_8x8_LOAD_LEFT;
2729     SRC(0,0)= (l0 + l1 + 1) >> 1;
2730     SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
2731     SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
2732     SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
2733     SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
2734     SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
2735     SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
2736     SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
2737     SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
2738     SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
2739     SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
2740     SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
2741     SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
2742     SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
2743     SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
2744     SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
2745     SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
2746     SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
2747 }
/* The 8x8 luma prediction helper macros are not needed past this point. */
#undef SRC
#undef PL
#undef PT
#undef PTR
#undef PREDICT_8x8_DC
#undef PREDICT_8x8_LOAD_TOPRIGHT
#undef PREDICT_8x8_LOAD_TOPLEFT
#undef PREDICT_8x8_LOAD_TOP
#undef PREDICT_8x8_LOAD_LEFT
2757
2758 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
2759                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2760                            int src_x_offset, int src_y_offset,
2761                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
2762     MpegEncContext * const s = &h->s;
2763     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
2764     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
2765     const int luma_xy= (mx&3) + ((my&3)<<2);
2766     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
2767     uint8_t * src_cb, * src_cr;
2768     int extra_width= h->emu_edge_width;
2769     int extra_height= h->emu_edge_height;
2770     int emu=0;
2771     const int full_mx= mx>>2;
2772     const int full_my= my>>2;
2773     const int pic_width  = 16*s->mb_width;
2774     const int pic_height = 16*s->mb_height >> MB_MBAFF;
2775
2776     if(!pic->data[0]) //FIXME this is unacceptable, some senseable error concealment must be done for missing reference frames
2777         return;
2778
2779     if(mx&7) extra_width -= 3;
2780     if(my&7) extra_height -= 3;
2781
2782     if(   full_mx < 0-extra_width
2783        || full_my < 0-extra_height
2784        || full_mx + 16/*FIXME*/ > pic_width + extra_width
2785        || full_my + 16/*FIXME*/ > pic_height + extra_height){
2786         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
2787             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
2788         emu=1;
2789     }
2790
2791     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
2792     if(!square){
2793         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
2794     }
2795
2796     if(s->flags&CODEC_FLAG_GRAY) return;
2797
2798     if(MB_MBAFF){
2799         // chroma offset when predicting from a field of opposite parity
2800         my += 2 * ((s->mb_y & 1) - (h->ref_cache[list][scan8[n]] & 1));
2801         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
2802     }
2803     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
2804     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
2805
2806     if(emu){
2807         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
2808             src_cb= s->edge_emu_buffer;
2809     }
2810     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
2811
2812     if(emu){
2813         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
2814             src_cr= s->edge_emu_buffer;
2815     }
2816     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
2817 }
2818
2819 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
2820                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2821                            int x_offset, int y_offset,
2822                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2823                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
2824                            int list0, int list1){
2825     MpegEncContext * const s = &h->s;
2826     qpel_mc_func *qpix_op=  qpix_put;
2827     h264_chroma_mc_func chroma_op= chroma_put;
2828
2829     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
2830     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
2831     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
2832     x_offset += 8*s->mb_x;
2833     y_offset += 8*(s->mb_y >> MB_MBAFF);
2834
2835     if(list0){
2836         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
2837         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
2838                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
2839                            qpix_op, chroma_op);
2840
2841         qpix_op=  qpix_avg;
2842         chroma_op= chroma_avg;
2843     }
2844
2845     if(list1){
2846         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
2847         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
2848                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
2849                            qpix_op, chroma_op);
2850     }
2851 }
2852
2853 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
2854                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2855                            int x_offset, int y_offset,
2856                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2857                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
2858                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
2859                            int list0, int list1){
2860     MpegEncContext * const s = &h->s;
2861
2862     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
2863     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
2864     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
2865     x_offset += 8*s->mb_x;
2866     y_offset += 8*(s->mb_y >> MB_MBAFF);
2867
2868     if(list0 && list1){
2869         /* don't optimize for luma-only case, since B-frames usually
2870          * use implicit weights => chroma too. */
2871         uint8_t *tmp_cb = s->obmc_scratchpad;
2872         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
2873         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
2874         int refn0 = h->ref_cache[0][ scan8[n] ];
2875         int refn1 = h->ref_cache[1][ scan8[n] ];
2876
2877         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
2878                     dest_y, dest_cb, dest_cr,
2879                     x_offset, y_offset, qpix_put, chroma_put);
2880         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
2881                     tmp_y, tmp_cb, tmp_cr,
2882                     x_offset, y_offset, qpix_put, chroma_put);
2883
2884         if(h->use_weight == 2){
2885             int weight0 = h->implicit_weight[refn0][refn1];
2886             int weight1 = 64 - weight0;
2887             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
2888             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
2889             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
2890         }else{
2891             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
2892                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
2893                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
2894             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
2895                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
2896                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
2897             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
2898                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
2899                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
2900         }
2901     }else{
2902         int list = list1 ? 1 : 0;
2903         int refn = h->ref_cache[list][ scan8[n] ];
2904         Picture *ref= &h->ref_list[list][refn];
2905         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
2906                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
2907                     qpix_put, chroma_put);
2908
2909         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
2910                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
2911         if(h->use_weight_chroma){
2912             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
2913                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
2914             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
2915                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
2916         }
2917     }
2918 }
2919
2920 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
2921                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2922                            int x_offset, int y_offset,
2923                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2924                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
2925                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
2926                            int list0, int list1){
2927     if((h->use_weight==2 && list0 && list1
2928         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
2929        || h->use_weight==1)
2930         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
2931                          x_offset, y_offset, qpix_put, chroma_put,
2932                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
2933     else
2934         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
2935                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
2936 }
2937
2938 static inline void prefetch_motion(H264Context *h, int list){
2939     /* fetch pixels for estimated mv 4 macroblocks ahead
2940      * optimized for 64byte cache lines */
2941     MpegEncContext * const s = &h->s;
2942     const int refn = h->ref_cache[list][scan8[0]];
2943     if(refn >= 0){
2944         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
2945         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
2946         uint8_t **src= h->ref_list[list][refn].data;
2947         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
2948         s->dsp.prefetch(src[0]+off, s->linesize, 4);
2949         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
2950         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
2951     }
2952 }
2953
/**
 * Performs inter prediction for the current macroblock.
 *
 * The macroblock type is read from s->current_picture.mb_type and must be an
 * inter type (asserted). Depending on the partitioning (16x16, 16x8, 8x16 or
 * 8x8 with per-8x8 sub-partitioning) mc_part() is called once per partition.
 * A prefetch for list 0 is issued before and one for list 1 after the
 * motion compensation.
 *
 * @param dest_y,dest_cb,dest_cr destination planes, passed unchanged to mc_part()
 * @param qpix_put,qpix_avg   luma function tables; row 0 is used for 16x16,
 *                            row 1 for 16x8 / 8x16 / 8x8, row 2 for smaller blocks
 * @param chroma_put,chroma_avg chroma function tables, indexed 0..2 below
 * @param weight_op,weight_avg weighting function tables; entries 0..6 are
 *                            selected by partition shape below
 *
 * NOTE(review): the x/y offsets passed to mc_part() look like units of
 * 2 luma samples (the second 8-pixel half is passed as 4) -- confirm in the
 * callee.
 */
2954 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2955                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
2956                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
2957                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
2958     MpegEncContext * const s = &h->s;
2959     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
2960     const int mb_type= s->current_picture.mb_type[mb_xy];
2961
2962     assert(IS_INTER(mb_type));
2963
2964     prefetch_motion(h, 0);
2965
2966     if(IS_16X16(mb_type)){
         /* single partition covering the whole macroblock */
2967         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
2968                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
2969                 &weight_op[0], &weight_avg[0],
2970                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2971     }else if(IS_16X8(mb_type)){
         /* two partitions: blocks 0 and 8, the second at y_offset 4 */
2972         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
2973                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
2974                 &weight_op[1], &weight_avg[1],
2975                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2976         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
2977                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
2978                 &weight_op[1], &weight_avg[1],
2979                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
2980     }else if(IS_8X16(mb_type)){
         /* two partitions: blocks 0 and 4, the second at x_offset 4 */
2981         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
2982                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2983                 &weight_op[2], &weight_avg[2],
2984                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2985         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
2986                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2987                 &weight_op[2], &weight_avg[2],
2988                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
2989     }else{
2990         int i;
2991
2992         assert(IS_8X8(mb_type));
2993
         /* four 8x8 quadrants, each with its own sub-partitioning and
          * prediction directions taken from h->sub_mb_type[i] */
2994         for(i=0; i<4; i++){
2995             const int sub_mb_type= h->sub_mb_type[i];
2996             const int n= 4*i;
2997             int x_offset= (i&1)<<2;
2998             int y_offset= (i&2)<<1;
2999
3000             if(IS_SUB_8X8(sub_mb_type)){
3001                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
3002                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
3003                     &weight_op[3], &weight_avg[3],
3004                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3005             }else if(IS_SUB_8X4(sub_mb_type)){
3006                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
3007                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
3008                     &weight_op[4], &weight_avg[4],
3009                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3010                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
3011                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
3012                     &weight_op[4], &weight_avg[4],
3013                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3014             }else if(IS_SUB_4X8(sub_mb_type)){
3015                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
3016                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
3017                     &weight_op[5], &weight_avg[5],
3018                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3019                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
3020                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
3021                     &weight_op[5], &weight_avg[5],
3022                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3023             }else{
3024                 int j;
3025                 assert(IS_SUB_4X4(sub_mb_type));
3026                 for(j=0; j<4; j++){
3027                     int sub_x_offset= x_offset + 2*(j&1);
3028                     int sub_y_offset= y_offset +   (j&2);
3029                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
3030                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
3031                         &weight_op[6], &weight_avg[6],
3032                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3033                 }
3034             }
3035         }
3036     }
3037
3038     prefetch_motion(h, 1);
3039 }
3040
3041 static void decode_init_vlc(){
3042     static int done = 0;
3043
3044     if (!done) {
3045         int i;
3046         done = 1;
3047
3048         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
3049                  &chroma_dc_coeff_token_len [0], 1, 1,
3050                  &chroma_dc_coeff_token_bits[0], 1, 1, 1);
3051
3052         for(i=0; i<4; i++){
3053             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
3054                      &coeff_token_len [i][0], 1, 1,
3055                      &coeff_token_bits[i][0], 1, 1, 1);
3056         }
3057
3058         for(i=0; i<3; i++){
3059             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
3060                      &chroma_dc_total_zeros_len [i][0], 1, 1,
3061                      &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
3062         }
3063         for(i=0; i<15; i++){
3064             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
3065                      &total_zeros_len [i][0], 1, 1,
3066                      &total_zeros_bits[i][0], 1, 1, 1);
3067         }
3068
3069         for(i=0; i<6; i++){
3070             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
3071                      &run_len [i][0], 1, 1,
3072                      &run_bits[i][0], 1, 1, 1);
3073         }
3074         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
3075                  &run_len [6][0], 1, 1,
3076                  &run_bits[6][0], 1, 1, 1);
3077     }
3078 }
3079
3080 /**
3081  * Sets the intra prediction function pointers.
3082  */
3083 static void init_pred_ptrs(H264Context *h){
3084 //    MpegEncContext * const s = &h->s;
3085
3086     h->pred4x4[VERT_PRED           ]= pred4x4_vertical_c;
3087     h->pred4x4[HOR_PRED            ]= pred4x4_horizontal_c;
3088     h->pred4x4[DC_PRED             ]= pred4x4_dc_c;
3089     h->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_c;
3090     h->pred4x4[DIAG_DOWN_RIGHT_PRED]= pred4x4_down_right_c;
3091     h->pred4x4[VERT_RIGHT_PRED     ]= pred4x4_vertical_right_c;
3092     h->pred4x4[HOR_DOWN_PRED       ]= pred4x4_horizontal_down_c;
3093     h->pred4x4[VERT_LEFT_PRED      ]= pred4x4_vertical_left_c;
3094     h->pred4x4[HOR_UP_PRED         ]= pred4x4_horizontal_up_c;
3095     h->pred4x4[LEFT_DC_PRED        ]= pred4x4_left_dc_c;
3096     h->pred4x4[TOP_DC_PRED         ]= pred4x4_top_dc_c;
3097     h->pred4x4[DC_128_PRED         ]= pred4x4_128_dc_c;
3098
3099     h->pred8x8l[VERT_PRED           ]= pred8x8l_vertical_c;
3100     h->pred8x8l[HOR_PRED            ]= pred8x8l_horizontal_c;
3101     h->pred8x8l[DC_PRED             ]= pred8x8l_dc_c;
3102     h->pred8x8l[DIAG_DOWN_LEFT_PRED ]= pred8x8l_down_left_c;
3103     h->pred8x8l[DIAG_DOWN_RIGHT_PRED]= pred8x8l_down_right_c;
3104     h->pred8x8l[VERT_RIGHT_PRED     ]= pred8x8l_vertical_right_c;
3105     h->pred8x8l[HOR_DOWN_PRED       ]= pred8x8l_horizontal_down_c;
3106     h->pred8x8l[VERT_LEFT_PRED      ]= pred8x8l_vertical_left_c;
3107     h->pred8x8l[HOR_UP_PRED         ]= pred8x8l_horizontal_up_c;
3108     h->pred8x8l[LEFT_DC_PRED        ]= pred8x8l_left_dc_c;
3109     h->pred8x8l[TOP_DC_PRED         ]= pred8x8l_top_dc_c;
3110     h->pred8x8l[DC_128_PRED         ]= pred8x8l_128_dc_c;
3111
3112     h->pred8x8[DC_PRED8x8     ]= ff_pred8x8_dc_c;
3113     h->pred8x8[VERT_PRED8x8   ]= ff_pred8x8_vertical_c;
3114     h->pred8x8[HOR_PRED8x8    ]= ff_pred8x8_horizontal_c;
3115     h->pred8x8[PLANE_PRED8x8  ]= ff_pred8x8_plane_c;
3116     h->pred8x8[LEFT_DC_PRED8x8]= pred8x8_left_dc_c;
3117     h->pred8x8[TOP_DC_PRED8x8 ]= pred8x8_top_dc_c;
3118     h->pred8x8[DC_128_PRED8x8 ]= ff_pred8x8_128_dc_c;
3119
3120     h->pred16x16[DC_PRED8x8     ]= ff_pred16x16_dc_c;
3121     h->pred16x16[VERT_PRED8x8   ]= ff_pred16x16_vertical_c;
3122     h->pred16x16[HOR_PRED8x8    ]= ff_pred16x16_horizontal_c;
3123     h->pred16x16[PLANE_PRED8x8  ]= ff_pred16x16_plane_c;
3124     h->pred16x16[LEFT_DC_PRED8x8]= pred16x16_left_dc_c;
3125     h->pred16x16[TOP_DC_PRED8x8 ]= pred16x16_top_dc_c;
3126     h->pred16x16[DC_128_PRED8x8 ]= ff_pred16x16_128_dc_c;
3127 }
3128
3129 static void free_tables(H264Context *h){
3130     av_freep(&h->intra4x4_pred_mode);
3131     av_freep(&h->chroma_pred_mode_table);
3132     av_freep(&h->cbp_table);
3133     av_freep(&h->mvd_table[0]);
3134     av_freep(&h->mvd_table[1]);
3135     av_freep(&h->direct_table);
3136     av_freep(&h->non_zero_count);
3137     av_freep(&h->slice_table_base);
3138     av_freep(&h->top_borders[1]);
3139     av_freep(&h->top_borders[0]);
3140     h->slice_table= NULL;
3141
3142     av_freep(&h->mb2b_xy);
3143     av_freep(&h->mb2b8_xy);
3144
3145     av_freep(&h->s.obmc_scratchpad);
3146 }
3147
3148 static void init_dequant8_coeff_table(H264Context *h){
3149     int i,q,x;
3150     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
3151     h->dequant8_coeff[0] = h->dequant8_buffer[0];
3152     h->dequant8_coeff[1] = h->dequant8_buffer[1];
3153
3154     for(i=0; i<2; i++ ){
3155         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
3156             h->dequant8_coeff[1] = h->dequant8_buffer[0];
3157             break;
3158         }
3159
3160         for(q=0; q<52; q++){
3161             int shift = ff_div6[q];
3162             int idx = ff_rem6[q];
3163             for(x=0; x<64; x++)
3164                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
3165                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
3166                     h->pps.scaling_matrix8[i][x]) << shift;
3167         }
3168     }
3169 }
3170
3171 static void init_dequant4_coeff_table(H264Context *h){
3172     int i,j,q,x;
3173     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
3174     for(i=0; i<6; i++ ){
3175         h->dequant4_coeff[i] = h->dequant4_buffer[i];
3176         for(j=0; j<i; j++){
3177             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
3178                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
3179                 break;
3180             }
3181         }
3182         if(j<i)
3183             continue;
3184
3185         for(q=0; q<52; q++){
3186             int shift = ff_div6[q] + 2;
3187             int idx = ff_rem6[q];
3188             for(x=0; x<16; x++)
3189                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
3190                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
3191                     h->pps.scaling_matrix4[i][x]) << shift;
3192         }
3193     }
3194 }
3195
3196 static void init_dequant_tables(H264Context *h){
3197     int i,x;
3198     init_dequant4_coeff_table(h);
3199     if(h->pps.transform_8x8_mode)
3200         init_dequant8_coeff_table(h);
3201     if(h->sps.transform_bypass){
3202         for(i=0; i<6; i++)
3203             for(x=0; x<16; x++)
3204                 h->dequant4_coeff[i][0][x] = 1<<6;
3205         if(h->pps.transform_8x8_mode)
3206             for(i=0; i<2; i++)
3207                 for(x=0; x<64; x++)
3208                     h->dequant8_coeff[i][0][x] = 1<<6;
3209     }
3210 }
3211
3212
3213 /**
3214  * allocates tables.
3215  * needs width/height
3216  */
3217 static int alloc_tables(H264Context *h){
3218     MpegEncContext * const s = &h->s;
3219     const int big_mb_num= s->mb_stride * (s->mb_height+1);
3220     int x,y;
3221
3222     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
3223
3224     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
3225     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
3226     CHECKED_ALLOCZ(h->top_borders[0]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
3227     CHECKED_ALLOCZ(h->top_borders[1]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
3228     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
3229
3230     if( h->pps.cabac ) {
3231         CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
3232         CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
3233         CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
3234         CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
3235     }
3236
3237     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(uint8_t));
3238     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
3239
3240     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
3241     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
3242     for(y=0; y<s->mb_height; y++){
3243         for(x=0; x<s->mb_width; x++){
3244             const int mb_xy= x + y*s->mb_stride;
3245             const int b_xy = 4*x + 4*y*h->b_stride;
3246             const int b8_xy= 2*x + 2*y*h->b8_stride;
3247
3248             h->mb2b_xy [mb_xy]= b_xy;
3249             h->mb2b8_xy[mb_xy]= b8_xy;
3250         }
3251     }
3252
3253     s->obmc_scratchpad = NULL;
3254
3255     if(!h->dequant4_coeff[0])
3256         init_dequant_tables(h);
3257
3258     return 0;
3259 fail:
3260     free_tables(h);
3261     return -1;
3262 }
3263
3264 static void common_init(H264Context *h){
3265     MpegEncContext * const s = &h->s;
3266
3267     s->width = s->avctx->width;
3268     s->height = s->avctx->height;
3269     s->codec_id= s->avctx->codec->id;
3270
3271     init_pred_ptrs(h);
3272
3273     h->dequant_coeff_pps= -1;
3274     s->unrestricted_mv=1;
3275     s->decode=1; //FIXME
3276
3277     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
3278     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
3279 }
3280
3281 static int decode_init(AVCodecContext *avctx){
3282     H264Context *h= avctx->priv_data;
3283     MpegEncContext * const s = &h->s;
3284
3285     MPV_decode_defaults(s);
3286
3287     s->avctx = avctx;
3288     common_init(h);
3289
3290     s->out_format = FMT_H264;
3291     s->workaround_bugs= avctx->workaround_bugs;
3292
3293     // set defaults
3294 //    s->decode_mb= ff_h263_decode_mb;
3295     s->low_delay= 1;
3296     avctx->pix_fmt= PIX_FMT_YUV420P;
3297
3298     decode_init_vlc();
3299
3300     if(avctx->extradata_size > 0 && avctx->extradata &&
3301        *(char *)avctx->extradata == 1){
3302         h->is_avc = 1;
3303         h->got_avcC = 0;
3304     } else {
3305         h->is_avc = 0;
3306     }
3307
3308     return 0;
3309 }
3310
3311 static int frame_start(H264Context *h){
3312     MpegEncContext * const s = &h->s;
3313     int i;
3314
3315     if(MPV_frame_start(s, s->avctx) < 0)
3316         return -1;
3317     ff_er_frame_start(s);
3318
3319     assert(s->linesize && s->uvlinesize);
3320
3321     for(i=0; i<16; i++){
3322         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
3323         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
3324     }
3325     for(i=0; i<4; i++){
3326         h->block_offset[16+i]=
3327         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
3328         h->block_offset[24+16+i]=
3329         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
3330     }
3331
3332     /* can't be in alloc_tables because linesize isn't known there.
3333      * FIXME: redo bipred weight to not require extra buffer? */
3334     if(!s->obmc_scratchpad)
3335         s->obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
3336
3337     /* some macroblocks will be accessed before they're available */
3338     if(FRAME_MBAFF)
3339         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
3340
3341 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
3342     return 0;
3343 }
3344
/**
 * Saves the right column and the bottom row of the macroblock at
 * src_y/src_cb/src_cr into h->left_border and h->top_borders[0][s->mb_x].
 *
 * left_border layout written here: [0] old top border corner of luma,
 * [1..16] rightmost luma pixel of each of the 16 rows, [17] / [17+9] old
 * corners of cb / cr, [18..25] / [27..34] rightmost cb / cr pixel of each of
 * the 8 chroma rows. The top border holds 16 luma bytes followed by 8 cb and
 * 8 cr bytes. Chroma is skipped when CODEC_FLAG_GRAY is set.
 *
 * The old top border values are read before they are overwritten, so the
 * statement order matters.
 */
3345 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
3346     MpegEncContext * const s = &h->s;
3347     int i;
3348
     /* step back one row so that row index i==1 is the first row of the macroblock */
3349     src_y  -=   linesize;
3350     src_cb -= uvlinesize;
3351     src_cr -= uvlinesize;
3352
3353     // There are two lines saved, the line above the top macroblock of a pair,
3354     // and the line above the bottom macroblock
     // NOTE(review): this non-pair variant only touches top_borders[0].
3355     h->left_border[0]= h->top_borders[0][s->mb_x][15];
3356     for(i=1; i<17; i++){
3357         h->left_border[i]= src_y[15+i*  linesize];
3358     }
3359
     /* last luma row of the macroblock, copied as two 8 byte words */
3360     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
3361     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
3362
3363     if(!(s->flags&CODEC_FLAG_GRAY)){
3364         h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
3365         h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
3366         for(i=1; i<9; i++){
3367             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
3368             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
3369         }
         /* last cb / cr row of the macroblock */
3370         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
3371         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
3372     }
3373 }
3374
/**
 * Exchanges the pixels bordering the current macroblock (the column to its
 * left and the row above it) between the picture and the saved borders in
 * h->left_border / h->top_borders[0].
 *
 * hl_decode_mb() calls this with xchg=1 before intra prediction and with
 * xchg=0 afterwards, when the deblocking filter is enabled and the frame is
 * not MBAFF.
 *
 * @param xchg if nonzero the two sides are swapped; if zero the saved border
 *             is only copied into the picture and itself left unchanged.
 *             Some of the top border words are always swapped, see below.
 *
 * The left column is only handled for mb_x > 0 and the top row only for
 * mb_y > 0. Chroma is skipped when CODEC_FLAG_GRAY is set.
 */
3375 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
3376     MpegEncContext * const s = &h->s;
3377     int temp8, i;
3378     uint64_t temp64;
3379     int deblock_left = (s->mb_x > 0);
3380     int deblock_top  = (s->mb_y > 0);
3381
     /* move to the pixel above-left of the macroblock */
3382     src_y  -=   linesize + 1;
3383     src_cb -= uvlinesize + 1;
3384     src_cr -= uvlinesize + 1;
3385
 /* XCHG(a,b,t,xchg): b always receives the old value of a; a receives the
  * old value of b only when xchg is nonzero. t is a temporary. */
3386 #define XCHG(a,b,t,xchg)\
3387 t= a;\
3388 if(xchg)\
3389     a= b;\
3390 b= t;
3391
3392     if(deblock_left){
         /* row 0 is the corner in the row above; skipped when there is no top neighbour */
3393         for(i = !deblock_top; i<17; i++){
3394             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
3395         }
3396     }
3397
3398     if(deblock_top){
         /* first 8 luma bytes honour xchg, the remaining words are always swapped */
3399         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
3400         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
3401         if(s->mb_x+1 < s->mb_width){
             /* first 8 bytes of the next macroblock's top border (the top-right pixels) */
3402             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
3403         }
3404     }
3405
3406     if(!(s->flags&CODEC_FLAG_GRAY)){
3407         if(deblock_left){
3408             for(i = !deblock_top; i<9; i++){
3409                 XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
3410                 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
3411             }
3412         }
3413         if(deblock_top){
3414             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
3415             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
3416         }
3417     }
3418 }
3419
/**
 * Macroblock pair variant of backup_mb_border(): saves the right column of
 * 32 luma / 16 chroma rows and the last two rows of the pair into
 * h->left_border and h->top_borders[0..1][s->mb_x].
 *
 * left_border layout written here: [0..1] old luma top border corners,
 * [2..33] rightmost luma pixel of each row, [34..35] / [52..53] old cb / cr
 * corners, [36..51] / [54..69] rightmost cb / cr pixel of each chroma row.
 * top_borders[0] receives the second to last row of the pair and
 * top_borders[1] the last one. Chroma is skipped when CODEC_FLAG_GRAY is set.
 *
 * The old top border values are read before they are overwritten, so the
 * statement order matters.
 */
3420 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
3421     MpegEncContext * const s = &h->s;
3422     int i;
3423
     /* step back two rows so that row index i==2 is the first row of the pair */
3424     src_y  -= 2 *   linesize;
3425     src_cb -= 2 * uvlinesize;
3426     src_cr -= 2 * uvlinesize;
3427
3428     // There are two lines saved, the line above the top macroblock of a pair,
3429     // and the line above the bottom macroblock
3430     h->left_border[0]= h->top_borders[0][s->mb_x][15];
3431     h->left_border[1]= h->top_borders[1][s->mb_x][15];
3432     for(i=2; i<34; i++){
3433         h->left_border[i]= src_y[15+i*  linesize];
3434     }
3435
     /* last two luma rows of the pair, each copied as two 8 byte words */
3436     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
3437     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
3438     *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
3439     *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
3440
3441     if(!(s->flags&CODEC_FLAG_GRAY)){
3442         h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
3443         h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
3444         h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
3445         h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
3446         for(i=2; i<18; i++){
3447             h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
3448             h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
3449         }
         /* last two cb / cr rows of the pair */
3450         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
3451         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
3452         *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
3453         *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
3454     }
3455 }
3456
/**
 * Macroblock pair variant of xchg_mb_border(): exchanges the column left of
 * the pair and the two rows above it between the picture and the saved
 * borders in h->left_border / h->top_borders[0..1].
 *
 * @param xchg if nonzero the two sides are swapped; if zero the saved border
 *             is only copied into the picture and itself left unchanged.
 *             Some of the top border words are always swapped, see below.
 *
 * The left column is only handled for mb_x > 0 and the top rows only for
 * mb_y > 1. Chroma is skipped when CODEC_FLAG_GRAY is set.
 */
3457 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
3458     MpegEncContext * const s = &h->s;
3459     int temp8, i;
3460     uint64_t temp64;
3461     int deblock_left = (s->mb_x > 0);
3462     int deblock_top  = (s->mb_y > 1);
3463
3464     tprintf("xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
3465
     /* move two rows up and one pixel left of the pair */
3466     src_y  -= 2 *   linesize + 1;
3467     src_cb -= 2 * uvlinesize + 1;
3468     src_cr -= 2 * uvlinesize + 1;
3469
 /* XCHG(a,b,t,xchg): b always receives the old value of a; a receives the
  * old value of b only when xchg is nonzero. t is a temporary. */
3470 #define XCHG(a,b,t,xchg)\
3471 t= a;\
3472 if(xchg)\
3473     a= b;\
3474 b= t;
3475
3476     if(deblock_left){
         /* rows 0 and 1 lie above the pair; skipped when there is no top neighbour */
3477         for(i = (!deblock_top)<<1; i<34; i++){
3478             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
3479         }
3480     }
3481
3482     if(deblock_top){
         /* first 8 luma bytes of each row honour xchg, the remaining words are always swapped */
3483         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
3484         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
3485         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
3486         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
3487         if(s->mb_x+1 < s->mb_width){
             /* first 8 bytes of the next pair's top borders (the top-right pixels) */
3488             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
3489             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
3490         }
3491     }
3492
3493     if(!(s->flags&CODEC_FLAG_GRAY)){
3494         if(deblock_left){
3495             for(i = (!deblock_top) << 1; i<18; i++){
3496                 XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
3497                 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
3498             }
3499         }
3500         if(deblock_top){
3501             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
3502             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
3503             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
3504             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
3505         }
3506     }
3507 }
3508
3509 static void hl_decode_mb(H264Context *h){
3510     MpegEncContext * const s = &h->s;
3511     const int mb_x= s->mb_x;
3512     const int mb_y= s->mb_y;
3513     const int mb_xy= mb_x + mb_y*s->mb_stride;
3514     const int mb_type= s->current_picture.mb_type[mb_xy];
3515     uint8_t  *dest_y, *dest_cb, *dest_cr;
3516     int linesize, uvlinesize /*dct_offset*/;
3517     int i;
3518     int *block_offset = &h->block_offset[0];
3519     const unsigned int bottom = mb_y & 1;
3520     const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass);
3521     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
3522     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
3523
3524     if(!s->decode)
3525         return;
3526
3527     dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
3528     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3529     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3530
3531     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
3532     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
3533
3534     if (MB_FIELD) {
3535         linesize   = h->mb_linesize   = s->linesize * 2;
3536         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
3537         block_offset = &h->block_offset[24];
3538         if(mb_y&1){ //FIXME move out of this func?
3539             dest_y -= s->linesize*15;
3540             dest_cb-= s->uvlinesize*7;
3541             dest_cr-= s->uvlinesize*7;
3542         }
3543         if(FRAME_MBAFF) {
3544             int list;
3545             for(list=0; list<2; list++){
3546                 if(!USES_LIST(mb_type, list))
3547                     continue;
3548                 if(IS_16X16(mb_type)){
3549                     int8_t *ref = &h->ref_cache[list][scan8[0]];
3550                     fill_rectangle(ref, 4, 4, 8, 16+*ref^(s->mb_y&1), 1);
3551                 }else{
3552                     for(i=0; i<16; i+=4){
3553                         //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
3554                         int ref = h->ref_cache[list][scan8[i]];
3555                         if(ref >= 0)
3556                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, 16+ref^(s->mb_y&1), 1);
3557                     }
3558                 }
3559             }
3560         }
3561     } else {
3562         linesize   = h->mb_linesize   = s->linesize;
3563         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
3564 //        dct_offset = s->linesize * 16;
3565     }
3566
3567     if(transform_bypass){
3568         idct_dc_add =
3569         idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
3570     }else if(IS_8x8DCT(mb_type)){
3571         idct_dc_add = s->dsp.h264_idct8_dc_add;
3572         idct_add = s->dsp.h264_idct8_add;
3573     }else{
3574         idct_dc_add = s->dsp.h264_idct_dc_add;
3575         idct_add = s->dsp.h264_idct_add;
3576     }
3577
3578     if(FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
3579        && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
3580         int mbt_y = mb_y&~1;
3581         uint8_t *top_y  = s->current_picture.data[0] + (mbt_y * 16* s->linesize  ) + mb_x * 16;
3582         uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
3583         uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
3584         xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
3585     }
3586
3587     if (IS_INTRA_PCM(mb_type)) {
3588         unsigned int x, y;
3589
3590         // The pixels are stored in h->mb array in the same order as levels,
3591         // copy them in output in the correct order.
3592         for(i=0; i<16; i++) {
3593             for (y=0; y<4; y++) {
3594                 for (x=0; x<4; x++) {
3595                     *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
3596                 }
3597             }
3598         }
3599         for(i=16; i<16+4; i++) {
3600             for (y=0; y<4; y++) {
3601                 for (x=0; x<4; x++) {
3602                     *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
3603                 }
3604             }
3605         }
3606         for(i=20; i<20+4; i++) {
3607             for (y=0; y<4; y++) {
3608                 for (x=0; x<4; x++) {
3609                     *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
3610                 }
3611             }
3612         }
3613     } else {
3614         if(IS_INTRA(mb_type)){
3615             if(h->deblocking_filter && !FRAME_MBAFF)
3616                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1);
3617
3618             if(!(s->flags&CODEC_FLAG_GRAY)){
3619                 h->pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
3620                 h->pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
3621             }
3622
3623             if(IS_INTRA4x4(mb_type)){
3624                 if(!s->encoding){
3625                     if(IS_8x8DCT(mb_type)){
3626                         for(i=0; i<16; i+=4){
3627                             uint8_t * const ptr= dest_y + block_offset[i];
3628                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
3629                             const int nnz = h->non_zero_count_cache[ scan8[i] ];
3630                             h->pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
3631                                                    (h->topright_samples_available<<(i+1))&0x8000, linesize);
3632                             if(nnz){
3633                                 if(nnz == 1 && h->mb[i*16])
3634                                     idct_dc_add(ptr, h->mb + i*16, linesize);
3635                                 else
3636                                     idct_add(ptr, h->mb + i*16, linesize);
3637                             }
3638                         }
3639                     }else
3640                     for(i=0; i<16; i++){
3641                         uint8_t * const ptr= dest_y + block_offset[i];
3642                         uint8_t *topright;
3643                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
3644                         int nnz, tr;
3645
3646                         if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
3647                             const int topright_avail= (h->topright_samples_available<<i)&0x8000;
3648                             assert(mb_y || linesize <= block_offset[i]);
3649                             if(!topright_avail){
3650                                 tr= ptr[3 - linesize]*0x01010101;
3651                                 topright= (uint8_t*) &tr;
3652                             }else
3653                                 topright= ptr + 4 - linesize;
3654                         }else
3655                             topright= NULL;
3656
3657                         h->pred4x4[ dir ](ptr, topright, linesize);
3658                         nnz = h->non_zero_count_cache[ scan8[i] ];
3659                         if(nnz){
3660                             if(s->codec_id == CODEC_ID_H264){
3661                                 if(nnz == 1 && h->mb[i*16])
3662                                     idct_dc_add(ptr, h->mb + i*16, linesize);
3663                                 else
3664                                     idct_add(ptr, h->mb + i*16, linesize);
3665                             }else
3666                                 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
3667                         }
3668                     }
3669                 }
3670             }else{
3671                 h->pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
3672                 if(s->codec_id == CODEC_ID_H264){
3673                     if(!transform_bypass)
3674                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[IS_INTRA(mb_type) ? 0:3][s->qscale][0]);
3675                 }else
3676                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
3677             }
3678             if(h->deblocking_filter && !FRAME_MBAFF)
3679                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0);
3680         }else if(s->codec_id == CODEC_ID_H264){
3681             hl_motion(h, dest_y, dest_cb, dest_cr,
3682                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
3683                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
3684                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
3685         }
3686
3687
3688         if(!IS_INTRA4x4(mb_type)){
3689             if(s->codec_id == CODEC_ID_H264){
3690                 if(IS_INTRA16x16(mb_type)){
3691                     for(i=0; i<16; i++){
3692                         if(h->non_zero_count_cache[ scan8[i] ])
3693                             idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3694                         else if(h->mb[i*16])
3695                             idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3696                     }
3697                 }else{
3698                     const int di = IS_8x8DCT(mb_type) ? 4 : 1;
3699                     for(i=0; i<16; i+=di){
3700                         int nnz = h->non_zero_count_cache[ scan8[i] ];
3701                         if(nnz){
3702                             if(nnz==1 && h->mb[i*16])
3703                                 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3704                             else
3705                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3706                         }
3707                     }
3708                 }
3709             }else{
3710                 for(i=0; i<16; i++){
3711                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
3712                         uint8_t * const ptr= dest_y + block_offset[i];
3713                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
3714                     }
3715                 }
3716             }
3717         }
3718
3719         if(!(s->flags&CODEC_FLAG_GRAY)){
3720             uint8_t *dest[2] = {dest_cb, dest_cr};
3721             if(transform_bypass){
3722                 idct_add = idct_dc_add = s->dsp.add_pixels4;
3723             }else{
3724                 idct_add = s->dsp.h264_idct_add;
3725                 idct_dc_add = s->dsp.h264_idct_dc_add;
3726                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp][0]);
3727                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp][0]);
3728             }
3729             if(s->codec_id == CODEC_ID_H264){
3730                 for(i=16; i<16+8; i++){
3731                     if(h->non_zero_count_cache[ scan8[i] ])
3732                         idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
3733                     else if(h->mb[i*16])
3734                         idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
3735                 }
3736             }else{
3737                 for(i=16; i<16+8; i++){
3738                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
3739                         uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
3740                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
3741                     }
3742                 }
3743             }
3744         }
3745     }
3746     if(h->deblocking_filter) {
3747         if (FRAME_MBAFF) {
3748             //FIXME try deblocking one mb at a time?
3749             // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
3750             const int mb_y = s->mb_y - 1;
3751             uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
3752             const int mb_xy= mb_x + mb_y*s->mb_stride;
3753             const int mb_type_top   = s->current_picture.mb_type[mb_xy];
3754             const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
3755             if (!bottom) return;
3756             pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
3757             pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3758             pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3759
3760             if(IS_INTRA(mb_type_top | mb_type_bottom))
3761                 xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
3762
3763             backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
3764             // deblock a pair
3765             // top
3766             s->mb_y--;
3767             tprintf("call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
3768             fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
3769             h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy]);
3770             filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
3771             // bottom
3772             s->mb_y++;
3773             tprintf("call mbaff filter_mb\n");
3774             fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
3775             h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
3776             filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3777         } else {
3778             tprintf("call filter_mb\n");
3779             backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3780             fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
3781             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3782         }
3783     }
3784 }
3785
3786 /**
3787  * fills the default_ref_list.
3788  */
3789 static int fill_default_ref_list(H264Context *h){
3790     MpegEncContext * const s = &h->s;
3791     int i;
3792     int smallest_poc_greater_than_current = -1;
3793     Picture sorted_short_ref[32];
3794
3795     if(h->slice_type==B_TYPE){
3796         int out_i;
3797         int limit= INT_MIN;
3798
3799         /* sort frame according to poc in B slice */
3800         for(out_i=0; out_i<h->short_ref_count; out_i++){
3801             int best_i=INT_MIN;
3802             int best_poc=INT_MAX;
3803
3804             for(i=0; i<h->short_ref_count; i++){
3805                 const int poc= h->short_ref[i]->poc;
3806                 if(poc > limit && poc < best_poc){
3807                     best_poc= poc;
3808                     best_i= i;
3809                 }
3810             }
3811
3812             assert(best_i != INT_MIN);
3813
3814             limit= best_poc;
3815             sorted_short_ref[out_i]= *h->short_ref[best_i];
3816             tprintf("sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
3817             if (-1 == smallest_poc_greater_than_current) {
3818                 if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
3819                     smallest_poc_greater_than_current = out_i;
3820                 }
3821             }
3822         }
3823     }
3824
3825     if(s->picture_structure == PICT_FRAME){
3826         if(h->slice_type==B_TYPE){
3827             int list;
3828             tprintf("current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
3829
3830             // find the largest poc
3831             for(list=0; list<2; list++){
3832                 int index = 0;
3833                 int j= -99;
3834                 int step= list ? -1 : 1;
3835
3836                 for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
3837                     while(j<0 || j>= h->short_ref_count){
3838                         if(j != -99 && step == (list ? -1 : 1))
3839                             return -1;
3840                         step = -step;
3841                         j= smallest_poc_greater_than_current + (step>>1);
3842                     }
3843                     if(sorted_short_ref[j].reference != 3) continue;
3844                     h->default_ref_list[list][index  ]= sorted_short_ref[j];
3845                     h->default_ref_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
3846                 }
3847
3848                 for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
3849                     if(h->long_ref[i] == NULL) continue;
3850                     if(h->long_ref[i]->reference != 3) continue;
3851
3852                     h->default_ref_list[ list ][index  ]= *h->long_ref[i];
3853                     h->default_ref_list[ list ][index++].pic_id= i;;
3854                 }
3855
3856                 if(list && (smallest_poc_greater_than_current<=0 || smallest_poc_greater_than_current>=h->short_ref_count) && (1 < index)){
3857                     // swap the two first elements of L1 when
3858                     // L0 and L1 are identical
3859                     Picture temp= h->default_ref_list[1][0];
3860                     h->default_ref_list[1][0] = h->default_ref_list[1][1];
3861                     h->default_ref_list[1][1] = temp;
3862                 }
3863
3864                 if(index < h->ref_count[ list ])
3865                     memset(&h->default_ref_list[list][index], 0, sizeof(Picture)*(h->ref_count[ list ] - index));
3866             }
3867         }else{
3868             int index=0;
3869             for(i=0; i<h->short_ref_count; i++){
3870                 if(h->short_ref[i]->reference != 3) continue; //FIXME refernce field shit
3871                 h->default_ref_list[0][index  ]= *h->short_ref[i];
3872                 h->default_ref_list[0][index++].pic_id= h->short_ref[i]->frame_num;
3873             }
3874             for(i = 0; i < 16; i++){
3875                 if(h->long_ref[i] == NULL) continue;
3876                 if(h->long_ref[i]->reference != 3) continue;
3877                 h->default_ref_list[0][index  ]= *h->long_ref[i];
3878                 h->default_ref_list[0][index++].pic_id= i;;
3879             }
3880             if(index < h->ref_count[0])
3881                 memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
3882         }
3883     }else{ //FIELD
3884         if(h->slice_type==B_TYPE){
3885         }else{
3886             //FIXME second field balh
3887         }
3888     }
3889 #ifdef TRACE
3890     for (i=0; i<h->ref_count[0]; i++) {
3891         tprintf("List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
3892     }
3893     if(h->slice_type==B_TYPE){
3894         for (i=0; i<h->ref_count[1]; i++) {
3895             tprintf("List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[0][i].data[0]);
3896         }
3897     }
3898 #endif
3899     return 0;
3900 }
3901
3902 static void print_short_term(H264Context *h);
3903 static void print_long_term(H264Context *h);
3904
3905 static int decode_ref_pic_list_reordering(H264Context *h){
3906     MpegEncContext * const s = &h->s;
3907     int list, index;
3908
3909     print_short_term(h);
3910     print_long_term(h);
3911     if(h->slice_type==I_TYPE || h->slice_type==SI_TYPE) return 0; //FIXME move before func
3912
3913     for(list=0; list<2; list++){
3914         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
3915
3916         if(get_bits1(&s->gb)){
3917             int pred= h->curr_pic_num;
3918
3919             for(index=0; ; index++){
3920                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
3921                 unsigned int pic_id;
3922                 int i;
3923                 Picture *ref = NULL;
3924
3925                 if(reordering_of_pic_nums_idc==3)
3926                     break;
3927
3928                 if(index >= h->ref_count[list]){
3929                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
3930                     return -1;
3931                 }
3932
3933                 if(reordering_of_pic_nums_idc<3){
3934                     if(reordering_of_pic_nums_idc<2){
3935                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
3936
3937                         if(abs_diff_pic_num >= h->max_pic_num){
3938                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
3939                             return -1;
3940                         }
3941
3942                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
3943                         else                                pred+= abs_diff_pic_num;
3944                         pred &= h->max_pic_num - 1;
3945
3946                         for(i= h->short_ref_count-1; i>=0; i--){
3947                             ref = h->short_ref[i];
3948                             assert(ref->reference == 3);
3949                             assert(!ref->long_ref);
3950                             if(ref->data[0] != NULL && ref->frame_num == pred && ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer
3951                                 break;
3952                         }
3953                         if(i>=0)
3954                             ref->pic_id= ref->frame_num;
3955                     }else{
3956                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
3957                         if(pic_id>31){
3958                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
3959                             return -1;
3960                         }
3961                         ref = h->long_ref[pic_id];
3962                         if(ref){
3963                             ref->pic_id= pic_id;
3964                             assert(ref->reference == 3);
3965                             assert(ref->long_ref);
3966                             i=0;
3967                         }else{
3968                             i=-1;
3969                         }
3970                     }
3971
3972                     if (i < 0) {
3973                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
3974                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
3975                     } else {
3976                         for(i=index; i+1<h->ref_count[list]; i++){
3977                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
3978                                 break;
3979                         }
3980                         for(; i > index; i--){
3981                             h->ref_list[list][i]= h->ref_list[list][i-1];
3982                         }
3983                         h->ref_list[list][index]= *ref;
3984                     }
3985                 }else{
3986                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
3987                     return -1;
3988                 }
3989             }
3990         }
3991
3992         if(h->slice_type!=B_TYPE) break;
3993     }
3994     for(list=0; list<2; list++){
3995         for(index= 0; index < h->ref_count[list]; index++){
3996             if(!h->ref_list[list][index].data[0])
3997                 h->ref_list[list][index]= s->current_picture;
3998         }
3999         if(h->slice_type!=B_TYPE) break;
4000     }
4001
4002     if(h->slice_type==B_TYPE && !h->direct_spatial_mv_pred)
4003         direct_dist_scale_factor(h);
4004     direct_ref_list_init(h);
4005     return 0;
4006 }
4007
4008 static void fill_mbaff_ref_list(H264Context *h){
4009     int list, i, j;
4010     for(list=0; list<2; list++){
4011         for(i=0; i<h->ref_count[list]; i++){
4012             Picture *frame = &h->ref_list[list][i];
4013             Picture *field = &h->ref_list[list][16+2*i];
4014             field[0] = *frame;
4015             for(j=0; j<3; j++)
4016                 field[0].linesize[j] <<= 1;
4017             field[1] = field[0];
4018             for(j=0; j<3; j++)
4019                 field[1].data[j] += frame->linesize[j];
4020
4021             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
4022             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
4023             for(j=0; j<2; j++){
4024                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
4025                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
4026             }
4027         }
4028     }
4029     for(j=0; j<h->ref_count[1]; j++){
4030         for(i=0; i<h->ref_count[0]; i++)
4031             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
4032         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
4033         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
4034     }
4035 }
4036
4037 static int pred_weight_table(H264Context *h){
4038     MpegEncContext * const s = &h->s;
4039     int list, i;
4040     int luma_def, chroma_def;
4041
4042     h->use_weight= 0;
4043     h->use_weight_chroma= 0;
4044     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
4045     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
4046     luma_def = 1<<h->luma_log2_weight_denom;
4047     chroma_def = 1<<h->chroma_log2_weight_denom;
4048
4049     for(list=0; list<2; list++){
4050         for(i=0; i<h->ref_count[list]; i++){
4051             int luma_weight_flag, chroma_weight_flag;
4052
4053             luma_weight_flag= get_bits1(&s->gb);
4054             if(luma_weight_flag){
4055                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
4056                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
4057                 if(   h->luma_weight[list][i] != luma_def
4058                    || h->luma_offset[list][i] != 0)
4059                     h->use_weight= 1;
4060             }else{
4061                 h->luma_weight[list][i]= luma_def;
4062                 h->luma_offset[list][i]= 0;
4063             }
4064
4065             chroma_weight_flag= get_bits1(&s->gb);
4066             if(chroma_weight_flag){
4067                 int j;
4068                 for(j=0; j<2; j++){
4069                     h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
4070                     h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
4071                     if(   h->chroma_weight[list][i][j] != chroma_def
4072                        || h->chroma_offset[list][i][j] != 0)
4073                         h->use_weight_chroma= 1;
4074                 }
4075             }else{
4076                 int j;
4077                 for(j=0; j<2; j++){
4078                     h->chroma_weight[list][i][j]= chroma_def;
4079                     h->chroma_offset[list][i][j]= 0;
4080                 }
4081             }
4082         }
4083         if(h->slice_type != B_TYPE) break;
4084     }
4085     h->use_weight= h->use_weight || h->use_weight_chroma;
4086     return 0;
4087 }
4088
4089 static void implicit_weight_table(H264Context *h){
4090     MpegEncContext * const s = &h->s;
4091     int ref0, ref1;
4092     int cur_poc = s->current_picture_ptr->poc;
4093
4094     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
4095        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
4096         h->use_weight= 0;
4097         h->use_weight_chroma= 0;
4098         return;
4099     }
4100
4101     h->use_weight= 2;
4102     h->use_weight_chroma= 2;
4103     h->luma_log2_weight_denom= 5;
4104     h->chroma_log2_weight_denom= 5;
4105
4106     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
4107         int poc0 = h->ref_list[0][ref0].poc;
4108         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
4109             int poc1 = h->ref_list[1][ref1].poc;
4110             int td = clip(poc1 - poc0, -128, 127);
4111             if(td){
4112                 int tb = clip(cur_poc - poc0, -128, 127);
4113                 int tx = (16384 + (FFABS(td) >> 1)) / td;
4114                 int dist_scale_factor = clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
4115                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
4116                     h->implicit_weight[ref0][ref1] = 32;
4117                 else
4118                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
4119             }else
4120                 h->implicit_weight[ref0][ref1] = 32;
4121         }
4122     }
4123 }
4124
4125 static inline void unreference_pic(H264Context *h, Picture *pic){
4126     int i;
4127     pic->reference=0;
4128     if(pic == h->delayed_output_pic)
4129         pic->reference=1;
4130     else{
4131         for(i = 0; h->delayed_pic[i]; i++)
4132             if(pic == h->delayed_pic[i]){
4133                 pic->reference=1;
4134                 break;
4135             }
4136     }
4137 }
4138
4139 /**
4140  * instantaneous decoder refresh.
4141  */
4142 static void idr(H264Context *h){
4143     int i;
4144
4145     for(i=0; i<16; i++){
4146         if (h->long_ref[i] != NULL) {
4147             unreference_pic(h, h->long_ref[i]);
4148             h->long_ref[i]= NULL;
4149         }
4150     }
4151     h->long_ref_count=0;
4152
4153     for(i=0; i<h->short_ref_count; i++){
4154         unreference_pic(h, h->short_ref[i]);
4155         h->short_ref[i]= NULL;
4156     }
4157     h->short_ref_count=0;
4158 }
4159
4160 /* forget old pics after a seek */
4161 static void flush_dpb(AVCodecContext *avctx){
4162     H264Context *h= avctx->priv_data;
4163     int i;
4164     for(i=0; i<16; i++) {
4165         if(h->delayed_pic[i])
4166             h->delayed_pic[i]->reference= 0;
4167         h->delayed_pic[i]= NULL;
4168     }
4169     if(h->delayed_output_pic)
4170         h->delayed_output_pic->reference= 0;
4171     h->delayed_output_pic= NULL;
4172     idr(h);
4173     if(h->s.current_picture_ptr)
4174         h->s.current_picture_ptr->reference= 0;
4175 }
4176
4177 /**
4178  *
4179  * @return the removed picture or NULL if an error occurs
4180  */
4181 static Picture * remove_short(H264Context *h, int frame_num){
4182     MpegEncContext * const s = &h->s;
4183     int i;
4184
4185     if(s->avctx->debug&FF_DEBUG_MMCO)
4186         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
4187
4188     for(i=0; i<h->short_ref_count; i++){
4189         Picture *pic= h->short_ref[i];
4190         if(s->avctx->debug&FF_DEBUG_MMCO)
4191             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
4192         if(pic->frame_num == frame_num){
4193             h->short_ref[i]= NULL;
4194             memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i - 1)*sizeof(Picture*));
4195             h->short_ref_count--;
4196             return pic;
4197         }
4198     }
4199     return NULL;
4200 }
4201
4202 /**
4203  *
4204  * @return the removed picture or NULL if an error occurs
4205  */
4206 static Picture * remove_long(H264Context *h, int i){
4207     Picture *pic;
4208
4209     pic= h->long_ref[i];
4210     h->long_ref[i]= NULL;
4211     if(pic) h->long_ref_count--;
4212
4213     return pic;
4214 }
4215
4216 /**
4217  * print short term list
4218  */
4219 static void print_short_term(H264Context *h) {
4220     uint32_t i;
4221     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
4222         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
4223         for(i=0; i<h->short_ref_count; i++){
4224             Picture *pic= h->short_ref[i];
4225             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
4226         }
4227     }
4228 }
4229
4230 /**
4231  * print long term list
4232  */
4233 static void print_long_term(H264Context *h) {
4234     uint32_t i;
4235     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
4236         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
4237         for(i = 0; i < 16; i++){
4238             Picture *pic= h->long_ref[i];
4239             if (pic) {
4240                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
4241             }
4242         }
4243     }
4244 }
4245
4246 /**
4247  * Executes the reference picture marking (memory management control operations).
4248  */
4249 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
4250     MpegEncContext * const s = &h->s;
4251     int i, j;
4252     int current_is_long=0;
4253     Picture *pic;
4254
4255     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
4256         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
4257
4258     for(i=0; i<mmco_count; i++){
4259         if(s->avctx->debug&FF_DEBUG_MMCO)
4260             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_frame_num, h->mmco[i].long_index);
4261
4262         switch(mmco[i].opcode){
4263         case MMCO_SHORT2UNUSED:
4264             pic= remove_short(h, mmco[i].short_frame_num);
4265             if(pic)
4266                 unreference_pic(h, pic);
4267             else if(s->avctx->debug&FF_DEBUG_MMCO)
4268                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_short() failure\n");
4269             break;
4270         case MMCO_SHORT2LONG:
4271             pic= remove_long(h, mmco[i].long_index);
4272             if(pic) unreference_pic(h, pic);
4273
4274             h->long_ref[ mmco[i].long_index ]= remove_short(h, mmco[i].short_frame_num);
4275             if (h->long_ref[ mmco[i].long_index ]){
4276                 h->long_ref[ mmco[i].long_index ]->long_ref=1;
4277                 h->long_ref_count++;
4278             }
4279             break;
4280         case MMCO_LONG2UNUSED:
4281             pic= remove_long(h, mmco[i].long_index);
4282             if(pic)
4283                 unreference_pic(h, pic);
4284             else if(s->avctx->debug&FF_DEBUG_MMCO)
4285                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_long() failure\n");
4286             break;
4287         case MMCO_LONG:
4288             pic= remove_long(h, mmco[i].long_index);
4289             if(pic) unreference_pic(h, pic);
4290
4291             h->long_ref[ mmco[i].long_index ]= s->current_picture_ptr;
4292             h->long_ref[ mmco[i].long_index ]->long_ref=1;
4293             h->long_ref_count++;
4294
4295             current_is_long=1;
4296             break;
4297         case MMCO_SET_MAX_LONG:
4298             assert(mmco[i].long_index <= 16);
4299             // just remove the long term which index is greater than new max
4300             for(j = mmco[i].long_index; j<16; j++){
4301                 pic = remove_long(h, j);
4302                 if (pic) unreference_pic(h, pic);
4303             }
4304             break;
4305         case MMCO_RESET:
4306             while(h->short_ref_count){
4307                 pic= remove_short(h, h->short_ref[0]->frame_num);
4308                 if(pic) unreference_pic(h, pic);
4309             }
4310             for(j = 0; j < 16; j++) {
4311                 pic= remove_long(h, j);
4312                 if(pic) unreference_pic(h, pic);
4313             }
4314             break;
4315         default: assert(0);
4316         }
4317     }
4318
4319     if(!current_is_long){
4320         pic= remove_short(h, s->current_picture_ptr->frame_num);
4321         if(pic){
4322             unreference_pic(h, pic);
4323             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
4324         }
4325
4326         if(h->short_ref_count)
4327             memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
4328
4329         h->short_ref[0]= s->current_picture_ptr;
4330         h->short_ref[0]->long_ref=0;
4331         h->short_ref_count++;
4332     }
4333
4334     print_short_term(h);
4335     print_long_term(h);
4336     return 0;
4337 }
4338
4339 static int decode_ref_pic_marking(H264Context *h){
4340     MpegEncContext * const s = &h->s;
4341     int i;
4342
4343     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
4344         s->broken_link= get_bits1(&s->gb) -1;
4345         h->mmco[0].long_index= get_bits1(&s->gb) - 1; // current_long_term_idx
4346         if(h->mmco[0].long_index == -1)
4347             h->mmco_index= 0;
4348         else{
4349             h->mmco[0].opcode= MMCO_LONG;
4350             h->mmco_index= 1;
4351         }
4352     }else{
4353         if(get_bits1(&s->gb)){ // adaptive_ref_pic_marking_mode_flag
4354             for(i= 0; i<MAX_MMCO_COUNT; i++) {
4355                 MMCOOpcode opcode= get_ue_golomb(&s->gb);;
4356
4357                 h->mmco[i].opcode= opcode;
4358                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
4359                     h->mmco[i].short_frame_num= (h->frame_num - get_ue_golomb(&s->gb) - 1) & ((1<<h->sps.log2_max_frame_num)-1); //FIXME fields
4360 /*                    if(h->mmco[i].short_frame_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_frame_num ] == NULL){
4361                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
4362                         return -1;
4363                     }*/
4364                 }
4365                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
4366                     unsigned int long_index= get_ue_golomb(&s->gb);
4367                     if(/*h->mmco[i].long_index >= h->long_ref_count || h->long_ref[ h->mmco[i].long_index ] == NULL*/ long_index >= 16){
4368                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
4369                         return -1;
4370                     }
4371                     h->mmco[i].long_index= long_index;
4372                 }
4373
4374                 if(opcode > (unsigned)MMCO_LONG){
4375                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
4376                     return -1;
4377                 }
4378                 if(opcode == MMCO_END)
4379                     break;
4380             }
4381             h->mmco_index= i;
4382         }else{
4383             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
4384
4385             if(h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count){ //FIXME fields
4386                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
4387                 h->mmco[0].short_frame_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
4388                 h->mmco_index= 1;
4389             }else
4390                 h->mmco_index= 0;
4391         }
4392     }
4393
4394     return 0;
4395 }
4396
4397 static int init_poc(H264Context *h){
4398     MpegEncContext * const s = &h->s;
4399     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
4400     int field_poc[2];
4401
4402     if(h->nal_unit_type == NAL_IDR_SLICE){
4403         h->frame_num_offset= 0;
4404     }else{
4405         if(h->frame_num < h->prev_frame_num)
4406             h->frame_num_offset= h->prev_frame_num_offset + max_frame_num;
4407         else
4408             h->frame_num_offset= h->prev_frame_num_offset;
4409     }
4410
4411     if(h->sps.poc_type==0){
4412         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
4413
4414         if(h->nal_unit_type == NAL_IDR_SLICE){
4415              h->prev_poc_msb=
4416              h->prev_poc_lsb= 0;
4417         }
4418
4419         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
4420             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
4421         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
4422             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
4423         else
4424             h->poc_msb = h->prev_poc_msb;
4425 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
4426         field_poc[0] =
4427         field_poc[1] = h->poc_msb + h->poc_lsb;
4428         if(s->picture_structure == PICT_FRAME)
4429             field_poc[1] += h->delta_poc_bottom;
4430     }else if(h->sps.poc_type==1){
4431         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
4432         int i;
4433
4434         if(h->sps.poc_cycle_length != 0)
4435             abs_frame_num = h->frame_num_offset + h->frame_num;
4436         else
4437             abs_frame_num = 0;
4438
4439         if(h->nal_ref_idc==0 && abs_frame_num > 0)
4440             abs_frame_num--;
4441
4442         expected_delta_per_poc_cycle = 0;
4443         for(i=0; i < h->sps.poc_cycle_length; i++)
4444             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
4445
4446         if(abs_frame_num > 0){
4447             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
4448             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
4449
4450             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
4451             for(i = 0; i <= frame_num_in_poc_cycle; i++)
4452                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
4453         } else
4454             expectedpoc = 0;
4455
4456         if(h->nal_ref_idc == 0)
4457             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
4458
4459         field_poc[0] = expectedpoc + h->delta_poc[0];
4460         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
4461
4462         if(s->picture_structure == PICT_FRAME)
4463             field_poc[1] += h->delta_poc[1];
4464     }else{
4465         int poc;
4466         if(h->nal_unit_type == NAL_IDR_SLICE){
4467             poc= 0;
4468         }else{
4469             if(h->nal_ref_idc) poc= 2*(h->frame_num_offset + h->frame_num);
4470             else               poc= 2*(h->frame_num_offset + h->frame_num) - 1;
4471         }
4472         field_poc[0]= poc;
4473         field_poc[1]= poc;
4474     }
4475
4476     if(s->picture_structure != PICT_BOTTOM_FIELD)
4477         s->current_picture_ptr->field_poc[0]= field_poc[0];
4478     if(s->picture_structure != PICT_TOP_FIELD)
4479         s->current_picture_ptr->field_poc[1]= field_poc[1];
4480     if(s->picture_structure == PICT_FRAME) // FIXME field pix?
4481         s->current_picture_ptr->poc= FFMIN(field_poc[0], field_poc[1]);
4482
4483     return 0;
4484 }
4485
4486 /**
4487  * decodes a slice header.
4488  * this will allso call MPV_common_init() and frame_start() as needed
4489  */
4490 static int decode_slice_header(H264Context *h){
4491     MpegEncContext * const s = &h->s;
4492     unsigned int first_mb_in_slice;
4493     unsigned int pps_id;
4494     int num_ref_idx_active_override_flag;
4495     static const uint8_t slice_type_map[5]= {P_TYPE, B_TYPE, I_TYPE, SP_TYPE, SI_TYPE};
4496     unsigned int slice_type, tmp;
4497     int default_ref_list_done = 0;
4498
4499     s->current_picture.reference= h->nal_ref_idc != 0;
4500     s->dropable= h->nal_ref_idc == 0;
4501
4502     first_mb_in_slice= get_ue_golomb(&s->gb);
4503
4504     slice_type= get_ue_golomb(&s->gb);
4505     if(slice_type > 9){
4506         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
4507         return -1;
4508     }
4509     if(slice_type > 4){
4510         slice_type -= 5;
4511         h->slice_type_fixed=1;
4512     }else
4513         h->slice_type_fixed=0;
4514
4515     slice_type= slice_type_map[ slice_type ];
4516     if (slice_type == I_TYPE
4517         || (h->slice_num != 0 && slice_type == h->slice_type) ) {
4518         default_ref_list_done = 1;
4519     }
4520     h->slice_type= slice_type;
4521
4522     s->pict_type= h->slice_type; // to make a few old func happy, it's wrong though
4523
4524     pps_id= get_ue_golomb(&s->gb);
4525     if(pps_id>=MAX_PPS_COUNT){
4526         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
4527         return -1;
4528     }
4529     h->pps= h->pps_buffer[pps_id];
4530     if(h->pps.slice_group_count == 0){
4531         av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n");
4532         return -1;
4533     }
4534
4535     h->sps= h->sps_buffer[ h->pps.sps_id ];
4536     if(h->sps.log2_max_frame_num == 0){
4537         av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n");
4538         return -1;
4539     }
4540
4541     if(h->dequant_coeff_pps != pps_id){
4542         h->dequant_coeff_pps = pps_id;
4543         init_dequant_tables(h);
4544     }
4545
4546     s->mb_width= h->sps.mb_width;
4547     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
4548
4549     h->b_stride=  s->mb_width*4;
4550     h->b8_stride= s->mb_width*2;
4551
4552     s->width = 16*s->mb_width - 2*(h->sps.crop_left + h->sps.crop_right );
4553     if(h->sps.frame_mbs_only_flag)
4554         s->height= 16*s->mb_height - 2*(h->sps.crop_top  + h->sps.crop_bottom);
4555     else
4556         s->height= 16*s->mb_height - 4*(h->sps.crop_top  + h->sps.crop_bottom); //FIXME recheck
4557
4558     if (s->context_initialized
4559         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
4560         free_tables(h);
4561         MPV_common_end(s);
4562     }
4563     if (!s->context_initialized) {
4564         if (MPV_common_init(s) < 0)
4565             return -1;
4566
4567         if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
4568             memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
4569             memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
4570         }else{
4571             int i;
4572             for(i=0; i<16; i++){
4573 #define T(x) (x>>2) | ((x<<2) & 0xF)
4574                 h->zigzag_scan[i] = T(zigzag_scan[i]);
4575                 h-> field_scan[i] = T( field_scan[i]);
4576 #undef T
4577             }
4578         }
4579         if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
4580             memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
4581             memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
4582             memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
4583             memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
4584         }else{
4585             int i;
4586             for(i=0; i<64; i++){
4587 #define T(x) (x>>3) | ((x&7)<<3)
4588                 h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
4589                 h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
4590                 h->field_scan8x8[i]        = T(field_scan8x8[i]);
4591                 h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
4592 #undef T
4593             }
4594         }
4595         if(h->sps.transform_bypass){ //FIXME same ugly
4596             h->zigzag_scan_q0          = zigzag_scan;
4597             h->zigzag_scan8x8_q0       = zigzag_scan8x8;
4598             h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
4599             h->field_scan_q0           = field_scan;
4600             h->field_scan8x8_q0        = field_scan8x8;
4601             h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
4602         }else{
4603             h->zigzag_scan_q0          = h->zigzag_scan;
4604             h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
4605             h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
4606             h->field_scan_q0           = h->field_scan;
4607             h->field_scan8x8_q0        = h->field_scan8x8;
4608             h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
4609         }
4610
4611         alloc_tables(h);
4612
4613         s->avctx->width = s->width;
4614         s->avctx->height = s->height;
4615         s->avctx->sample_aspect_ratio= h->sps.sar;
4616         if(!s->avctx->sample_aspect_ratio.den)
4617             s->avctx->sample_aspect_ratio.den = 1;
4618
4619         if(h->sps.timing_info_present_flag){
4620             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
4621             if(h->x264_build > 0 && h->x264_build < 44)
4622                 s->avctx->time_base.den *= 2;
4623             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
4624                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
4625         }
4626     }
4627
4628     if(h->slice_num == 0){
4629         if(frame_start(h) < 0)
4630             return -1;
4631     }
4632
4633     s->current_picture_ptr->frame_num= //FIXME frame_num cleanup
4634     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
4635
4636     h->mb_mbaff = 0;
4637     h->mb_aff_frame = 0;
4638     if(h->sps.frame_mbs_only_flag){
4639         s->picture_structure= PICT_FRAME;
4640     }else{
4641         if(get_bits1(&s->gb)) { //field_pic_flag
4642             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
4643             av_log(h->s.avctx, AV_LOG_ERROR, "PAFF interlacing is not implemented\n");
4644         } else {
4645             s->picture_structure= PICT_FRAME;
4646             h->mb_aff_frame = h->sps.mb_aff;
4647         }
4648     }
4649     assert(s->mb_num == s->mb_width * s->mb_height);
4650     if(first_mb_in_slice << h->mb_aff_frame >= s->mb_num ||
4651        first_mb_in_slice                    >= s->mb_num){
4652         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
4653         return -1;
4654     }
4655     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
4656     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << h->mb_aff_frame;
4657     assert(s->mb_y < s->mb_height);
4658
4659     if(s->picture_structure==PICT_FRAME){
4660         h->curr_pic_num=   h->frame_num;
4661         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
4662     }else{
4663         h->curr_pic_num= 2*h->frame_num;
4664         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
4665     }
4666
4667     if(h->nal_unit_type == NAL_IDR_SLICE){
4668         get_ue_golomb(&s->gb); /* idr_pic_id */
4669     }
4670
4671     if(h->sps.poc_type==0){
4672         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
4673
4674         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
4675             h->delta_poc_bottom= get_se_golomb(&s->gb);
4676         }
4677     }
4678
4679     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
4680         h->delta_poc[0]= get_se_golomb(&s->gb);
4681
4682         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
4683             h->delta_poc[1]= get_se_golomb(&s->gb);
4684     }
4685
4686     init_poc(h);
4687
4688     if(h->pps.redundant_pic_cnt_present){
4689         h->redundant_pic_count= get_ue_golomb(&s->gb);
4690     }
4691
4692     //set defaults, might be overriden a few line later
4693     h->ref_count[0]= h->pps.ref_count[0];
4694     h->ref_count[1]= h->pps.ref_count[1];
4695
4696     if(h->slice_type == P_TYPE || h->slice_type == SP_TYPE || h->slice_type == B_TYPE){
4697         if(h->slice_type == B_TYPE){
4698             h->direct_spatial_mv_pred= get_bits1(&s->gb);
4699             if(h->sps.mb_aff && h->direct_spatial_mv_pred)
4700                 av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + spatial direct mode is not implemented\n");
4701         }
4702         num_ref_idx_active_override_flag= get_bits1(&s->gb);
4703
4704         if(num_ref_idx_active_override_flag){
4705             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
4706             if(h->slice_type==B_TYPE)
4707                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
4708
4709             if(h->ref_count[0] > 32 || h->ref_count[1] > 32){
4710                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
4711                 h->ref_count[0]= h->ref_count[1]= 1;
4712                 return -1;
4713             }
4714         }
4715     }
4716
4717     if(!default_ref_list_done){
4718         fill_default_ref_list(h);
4719     }
4720
4721     if(decode_ref_pic_list_reordering(h) < 0)
4722         return -1;
4723
4724     if(   (h->pps.weighted_pred          && (h->slice_type == P_TYPE || h->slice_type == SP_TYPE ))
4725        || (h->pps.weighted_bipred_idc==1 && h->slice_type==B_TYPE ) )
4726         pred_weight_table(h);
4727     else if(h->pps.weighted_bipred_idc==2 && h->slice_type==B_TYPE)
4728         implicit_weight_table(h);
4729     else
4730         h->use_weight = 0;
4731
4732     if(s->current_picture.reference)
4733         decode_ref_pic_marking(h);
4734
4735     if(FRAME_MBAFF)
4736         fill_mbaff_ref_list(h);
4737
4738     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE && h->pps.cabac ){
4739         tmp = get_ue_golomb(&s->gb);
4740         if(tmp > 2){
4741             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
4742             return -1;
4743         }
4744         h->cabac_init_idc= tmp;
4745     }
4746
4747     h->last_qscale_diff = 0;
4748     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
4749     if(tmp>51){
4750         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
4751         return -1;
4752     }
4753     s->qscale= tmp;
4754     h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
4755     //FIXME qscale / qp ... stuff
4756     if(h->slice_type == SP_TYPE){
4757         get_bits1(&s->gb); /* sp_for_switch_flag */
4758     }
4759     if(h->slice_type==SP_TYPE || h->slice_type == SI_TYPE){
4760         get_se_golomb(&s->gb); /* slice_qs_delta */
4761     }
4762
4763     h->deblocking_filter = 1;
4764     h->slice_alpha_c0_offset = 0;
4765     h->slice_beta_offset = 0;
4766     if( h->pps.deblocking_filter_parameters_present ) {
4767         tmp= get_ue_golomb(&s->gb);
4768         if(tmp > 2){
4769             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4770             return -1;
4771         }
4772         h->deblocking_filter= tmp;
4773         if(h->deblocking_filter < 2)
4774             h->deblocking_filter^= 1; // 1<->0
4775
4776         if( h->deblocking_filter ) {
4777             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4778             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4779         }
4780     }
4781     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4782        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type != I_TYPE)
4783        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type == B_TYPE)
4784        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4785         h->deblocking_filter= 0;
4786
4787 #if 0 //FMO
4788     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4789         slice_group_change_cycle= get_bits(&s->gb, ?);
4790 #endif
4791
4792     h->slice_num++;
4793
4794     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4795     h->emu_edge_height= FRAME_MBAFF ? 0 : h->emu_edge_width;
4796
4797     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4798         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s\n",
4799                h->slice_num,
4800                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4801                first_mb_in_slice,
4802                av_get_pict_type_char(h->slice_type),
4803                pps_id, h->frame_num,
4804                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4805                h->ref_count[0], h->ref_count[1],
4806                s->qscale,
4807                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4808                h->use_weight,
4809                h->use_weight==1 && h->use_weight_chroma ? "c" : ""
4810                );
4811     }
4812
4813     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !s->current_picture.reference){
4814         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
4815         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
4816     }else{
4817         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
4818         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
4819     }
4820
4821     return 0;
4822 }
4823
4824 /**
4825  *
4826  */
4827 static inline int get_level_prefix(GetBitContext *gb){
4828     unsigned int buf;
4829     int log;
4830
4831     OPEN_READER(re, gb);
4832     UPDATE_CACHE(re, gb);
4833     buf=GET_CACHE(re, gb);
4834
4835     log= 32 - av_log2(buf);
4836 #ifdef TRACE
4837     print_bin(buf>>(32-log), log);
4838     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4839 #endif
4840
4841     LAST_SKIP_BITS(re, gb, log);
4842     CLOSE_READER(re, gb);
4843
4844     return log-1;
4845 }
4846
4847 static inline int get_dct8x8_allowed(H264Context *h){
4848     int i;
4849     for(i=0; i<4; i++){
4850         if(!IS_SUB_8X8(h->sub_mb_type[i])
4851            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4852             return 0;
4853     }
4854     return 1;
4855 }
4856
4857 /**
4858  * decodes a residual block.
4859  * @param n block index
4860  * @param scantable scantable
4861  * @param max_coeff number of coefficients in the block
4862  * @return <0 if an error occured
4863  */
4864 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4865     MpegEncContext * const s = &h->s;
4866     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4867     int level[16];
4868     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4869
4870     //FIXME put trailing_onex into the context
4871
4872     if(n == CHROMA_DC_BLOCK_INDEX){
4873         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4874         total_coeff= coeff_token>>2;
4875     }else{
4876         if(n == LUMA_DC_BLOCK_INDEX){
4877             total_coeff= pred_non_zero_count(h, 0);
4878             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4879             total_coeff= coeff_token>>2;
4880         }else{
4881             total_coeff= pred_non_zero_count(h, n);
4882             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4883             total_coeff= coeff_token>>2;
4884             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4885         }
4886     }
4887
4888     //FIXME set last_non_zero?
4889
4890     if(total_coeff==0)
4891         return 0;
4892     if(total_coeff > (unsigned)max_coeff) {
4893         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4894         return -1;
4895     }
4896
4897     trailing_ones= coeff_token&3;
4898     tprintf("trailing:%d, total:%d\n", trailing_ones, total_coeff);
4899     assert(total_coeff<=16);
4900
4901     for(i=0; i<trailing_ones; i++){
4902         level[i]= 1 - 2*get_bits1(gb);
4903     }
4904
4905     if(i<total_coeff) {
4906         int level_code, mask;
4907         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4908         int prefix= get_level_prefix(gb);
4909
4910         //first coefficient has suffix_length equal to 0 or 1
4911         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4912             if(suffix_length)
4913                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4914             else
4915                 level_code= (prefix<<suffix_length); //part
4916         }else if(prefix==14){
4917             if(suffix_length)
4918                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4919             else
4920                 level_code= prefix + get_bits(gb, 4); //part
4921         }else if(prefix==15){
4922             level_code= (prefix<<suffix_length) + get_bits(gb, 12); //part
4923             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4924         }else{
4925             av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4926             return -1;
4927         }
4928
4929         if(trailing_ones < 3) level_code += 2;
4930
4931         suffix_length = 1;
4932         if(level_code > 5)
4933             suffix_length++;
4934         mask= -(level_code&1);
4935         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4936         i++;
4937
4938         //remaining coefficients have suffix_length > 0
4939         for(;i<total_coeff;i++) {
4940             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4941             prefix = get_level_prefix(gb);
4942             if(prefix<15){
4943                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4944             }else if(prefix==15){
4945                 level_code =  (prefix<<suffix_length) + get_bits(gb, 12);
4946             }else{
4947                 av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4948                 return -1;
4949             }
4950             mask= -(level_code&1);
4951             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4952             if(level_code > suffix_limit[suffix_length])
4953                 suffix_length++;
4954         }
4955     }
4956
4957     if(total_coeff == max_coeff)
4958         zeros_left=0;
4959     else{
4960         if(n == CHROMA_DC_BLOCK_INDEX)
4961             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4962         else
4963             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4964     }
4965
4966     coeff_num = zeros_left + total_coeff - 1;
4967     j = scantable[coeff_num];
4968     if(n > 24){
4969         block[j] = level[0];
4970         for(i=1;i<total_coeff;i++) {
4971             if(zeros_left <= 0)
4972                 run_before = 0;
4973             else if(zeros_left < 7){
4974                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4975             }else{
4976                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4977             }
4978             zeros_left -= run_before;
4979             coeff_num -= 1 + run_before;
4980             j= scantable[ coeff_num ];
4981
4982             block[j]= level[i];
4983         }
4984     }else{
4985         block[j] = (level[0] * qmul[j] + 32)>>6;
4986         for(i=1;i<total_coeff;i++) {
4987             if(zeros_left <= 0)
4988                 run_before = 0;
4989             else if(zeros_left < 7){
4990                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4991             }else{
4992                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4993             }
4994             zeros_left -= run_before;
4995             coeff_num -= 1 + run_before;
4996             j= scantable[ coeff_num ];
4997
4998             block[j]= (level[i] * qmul[j] + 32)>>6;
4999         }
5000     }
5001
5002     if(zeros_left<0){
5003         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
5004         return -1;
5005     }
5006
5007     return 0;
5008 }
5009
5010 static void predict_field_decoding_flag(H264Context *h){
5011     MpegEncContext * const s = &h->s;
5012     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5013     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
5014                 ? s->current_picture.mb_type[mb_xy-1]
5015                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
5016                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
5017                 : 0;
5018     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
5019 }
5020
5021 /**
5022  * decodes a P_SKIP or B_SKIP macroblock
5023  */
5024 static void decode_mb_skip(H264Context *h){
5025     MpegEncContext * const s = &h->s;
5026     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5027     int mb_type=0;
5028
5029     memset(h->non_zero_count[mb_xy], 0, 16);
5030     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
5031
5032     if(MB_FIELD)
5033         mb_type|= MB_TYPE_INTERLACED;
5034
5035     if( h->slice_type == B_TYPE )
5036     {
5037         // just for fill_caches. pred_direct_motion will set the real mb_type
5038         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
5039
5040         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
5041         pred_direct_motion(h, &mb_type);
5042         mb_type|= MB_TYPE_SKIP;
5043     }
5044     else
5045     {
5046         int mx, my;
5047         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
5048
5049         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
5050         pred_pskip_motion(h, &mx, &my);
5051         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
5052         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
5053     }
5054
5055     write_back_motion(h, mb_type);
5056     s->current_picture.mb_type[mb_xy]= mb_type;
5057     s->current_picture.qscale_table[mb_xy]= s->qscale;
5058     h->slice_table[ mb_xy ]= h->slice_num;
5059     h->prev_mb_skipped= 1;
5060 }
5061
5062 /**
5063  * decodes a macroblock
5064  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5065  */
5066 static int decode_mb_cavlc(H264Context *h){
5067     MpegEncContext * const s = &h->s;
5068     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5069     int partition_count;
5070     unsigned int mb_type, cbp;
5071     int dct8x8_allowed= h->pps.transform_8x8_mode;
5072
5073     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
5074
5075     tprintf("pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5076     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
5077                 down the code */
5078     if(h->slice_type != I_TYPE && h->slice_type != SI_TYPE){
5079         if(s->mb_skip_run==-1)
5080             s->mb_skip_run= get_ue_golomb(&s->gb);
5081
5082         if (s->mb_skip_run--) {
5083             if(FRAME_MBAFF && (s->mb_y&1) == 0){
5084                 if(s->mb_skip_run==0)
5085                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
5086                 else
5087                     predict_field_decoding_flag(h);
5088             }
5089             decode_mb_skip(h);
5090             return 0;
5091         }
5092     }
5093     if(FRAME_MBAFF){
5094         if( (s->mb_y&1) == 0 )
5095             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
5096     }else
5097         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5098
5099     h->prev_mb_skipped= 0;
5100
5101     mb_type= get_ue_golomb(&s->gb);
5102     if(h->slice_type == B_TYPE){
5103         if(mb_type < 23){
5104             partition_count= b_mb_type_info[mb_type].partition_count;
5105             mb_type=         b_mb_type_info[mb_type].type;
5106         }else{
5107             mb_type -= 23;
5108             goto decode_intra_mb;
5109         }
5110     }else if(h->slice_type == P_TYPE /*|| h->slice_type == SP_TYPE */){
5111         if(mb_type < 5){
5112             partition_count= p_mb_type_info[mb_type].partition_count;
5113             mb_type=         p_mb_type_info[mb_type].type;
5114         }else{
5115             mb_type -= 5;
5116             goto decode_intra_mb;
5117         }
5118     }else{
5119        assert(h->slice_type == I_TYPE);
5120 decode_intra_mb:
5121         if(mb_type > 25){
5122             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
5123             return -1;
5124         }
5125         partition_count=0;
5126         cbp= i_mb_type_info[mb_type].cbp;
5127         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5128         mb_type= i_mb_type_info[mb_type].type;
5129     }
5130
5131     if(MB_FIELD)
5132         mb_type |= MB_TYPE_INTERLACED;
5133
5134     h->slice_table[ mb_xy ]= h->slice_num;
5135
5136     if(IS_INTRA_PCM(mb_type)){
5137         unsigned int x, y;
5138
5139         // we assume these blocks are very rare so we dont optimize it
5140         align_get_bits(&s->gb);
5141
5142         // The pixels are stored in the same order as levels in h->mb array.
5143         for(y=0; y<16; y++){
5144             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
5145             for(x=0; x<16; x++){
5146                 tprintf("LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
5147                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
5148             }
5149         }
5150         for(y=0; y<8; y++){
5151             const int index= 256 + 4*(y&3) + 32*(y>>2);
5152             for(x=0; x<8; x++){
5153                 tprintf("CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
5154                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
5155             }
5156         }
5157         for(y=0; y<8; y++){
5158             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
5159             for(x=0; x<8; x++){
5160                 tprintf("CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
5161                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
5162             }
5163         }
5164
5165         // In deblocking, the quantizer is 0
5166         s->current_picture.qscale_table[mb_xy]= 0;
5167         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0);
5168         // All coeffs are present
5169         memset(h->non_zero_count[mb_xy], 16, 16);
5170
5171         s->current_picture.mb_type[mb_xy]= mb_type;
5172         return 0;
5173     }
5174
5175     if(MB_MBAFF){
5176         h->ref_count[0] <<= 1;
5177         h->ref_count[1] <<= 1;
5178     }
5179
5180     fill_caches(h, mb_type, 0);
5181
5182     //mb_pred
5183     if(IS_INTRA(mb_type)){
5184             int pred_mode;
5185 //            init_top_left_availability(h);
5186             if(IS_INTRA4x4(mb_type)){
5187                 int i;
5188                 int di = 1;
5189                 if(dct8x8_allowed && get_bits1(&s->gb)){
5190                     mb_type |= MB_TYPE_8x8DCT;
5191                     di = 4;
5192                 }
5193
5194 //                fill_intra4x4_pred_table(h);
5195                 for(i=0; i<16; i+=di){
5196                     int mode= pred_intra_mode(h, i);
5197
5198                     if(!get_bits1(&s->gb)){
5199                         const int rem_mode= get_bits(&s->gb, 3);
5200                         mode = rem_mode + (rem_mode >= mode);
5201                     }
5202
5203                     if(di==4)
5204                         fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5205                     else
5206                         h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
5207                 }
5208                 write_back_intra_pred_mode(h);
5209                 if( check_intra4x4_pred_mode(h) < 0)
5210                     return -1;
5211             }else{
5212                 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
5213                 if(h->intra16x16_pred_mode < 0)
5214                     return -1;
5215             }
5216
5217             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
5218             if(pred_mode < 0)
5219                 return -1;
5220             h->chroma_pred_mode= pred_mode;
5221     }else if(partition_count==4){
5222         int i, j, sub_partition_count[4], list, ref[2][4];
5223
5224         if(h->slice_type == B_TYPE){
5225             for(i=0; i<4; i++){
5226                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
5227                 if(h->sub_mb_type[i] >=13){
5228                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
5229                     return -1;
5230                 }
5231                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5232                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5233             }
5234             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
5235                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
5236                 pred_direct_motion(h, &mb_type);
5237                 h->ref_cache[0][scan8[4]] =
5238                 h->ref_cache[1][scan8[4]] =
5239                 h->ref_cache[0][scan8[12]] =
5240                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5241             }
5242         }else{
5243             assert(h->slice_type == P_TYPE || h->slice_type == SP_TYPE); //FIXME SP correct ?
5244             for(i=0; i<4; i++){
5245                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
5246                 if(h->sub_mb_type[i] >=4){
5247                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
5248                     return -1;
5249                 }
5250                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5251                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5252             }
5253         }
5254
5255         for(list=0; list<2; list++){
5256             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
5257             if(ref_count == 0) continue;
5258             for(i=0; i<4; i++){
5259                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
5260                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
5261                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
5262                     if(tmp>=ref_count){
5263                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
5264                         return -1;
5265                     }
5266                     ref[list][i]= tmp;
5267                 }else{
5268                  //FIXME
5269                     ref[list][i] = -1;
5270                 }
5271             }
5272         }
5273
5274         if(dct8x8_allowed)
5275             dct8x8_allowed = get_dct8x8_allowed(h);
5276
5277         for(list=0; list<2; list++){
5278             const int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
5279             if(ref_count == 0) continue;
5280
5281             for(i=0; i<4; i++){
5282                 if(IS_DIRECT(h->sub_mb_type[i])) {
5283                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
5284                     continue;
5285                 }
5286                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
5287                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5288
5289                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
5290                     const int sub_mb_type= h->sub_mb_type[i];
5291                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5292                     for(j=0; j<sub_partition_count[i]; j++){
5293                         int mx, my;
5294                         const int index= 4*i + block_width*j;
5295                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5296                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
5297                         mx += get_se_golomb(&s->gb);
5298                         my += get_se_golomb(&s->gb);
5299                         tprintf("final mv:%d %d\n", mx, my);
5300
5301                         if(IS_SUB_8X8(sub_mb_type)){
5302                             mv_cache[ 1 ][0]=
5303                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5304                             mv_cache[ 1 ][1]=
5305                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5306                         }else if(IS_SUB_8X4(sub_mb_type)){
5307                             mv_cache[ 1 ][0]= mx;
5308                             mv_cache[ 1 ][1]= my;
5309                         }else if(IS_SUB_4X8(sub_mb_type)){
5310                             mv_cache[ 8 ][0]= mx;
5311                             mv_cache[ 8 ][1]= my;
5312                         }
5313                         mv_cache[ 0 ][0]= mx;
5314                         mv_cache[ 0 ][1]= my;
5315                     }
5316                 }else{
5317                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5318                     p[0] = p[1]=
5319                     p[8] = p[9]= 0;
5320                 }
5321             }
5322         }
5323     }else if(IS_DIRECT(mb_type)){
5324         pred_direct_motion(h, &mb_type);
5325         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5326     }else{
5327         int list, mx, my, i;
5328          //FIXME we should set ref_idx_l? to 0 if we use that later ...
5329         if(IS_16X16(mb_type)){
5330             for(list=0; list<2; list++){
5331                 if(h->ref_count[list]>0){
5332                     if(IS_DIR(mb_type, 0, list)){
5333                         unsigned int val= get_te0_golomb(&s->gb, h->ref_count[list]);
5334                         if(val >= h->ref_count[list]){
5335                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
5336                             return -1;
5337                         }
5338                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
5339                     }else
5340                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (LIST_NOT_USED&0xFF), 1);
5341                 }
5342             }
5343             for(list=0; list<2; list++){
5344                 if(IS_DIR(mb_type, 0, list)){
5345                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
5346                     mx += get_se_golomb(&s->gb);
5347                     my += get_se_golomb(&s->gb);
5348                     tprintf("final mv:%d %d\n", mx, my);
5349
5350                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5351                 }else
5352                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5353             }
5354         }
5355         else if(IS_16X8(mb_type)){
5356             for(list=0; list<2; list++){
5357                 if(h->ref_count[list]>0){
5358                     for(i=0; i<2; i++){
5359                         if(IS_DIR(mb_type, i, list)){
5360                             unsigned int val= get_te0_golomb(&s->gb, h->ref_count[list]);
5361                             if(val >= h->ref_count[list]){
5362                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
5363                                 return -1;
5364                             }
5365                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
5366                         }else
5367                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5368                     }
5369                 }
5370             }
5371             for(list=0; list<2; list++){
5372                 for(i=0; i<2; i++){
5373                     if(IS_DIR(mb_type, i, list)){
5374                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
5375                         mx += get_se_golomb(&s->gb);
5376                         my += get_se_golomb(&s->gb);
5377                         tprintf("final mv:%d %d\n", mx, my);
5378
5379                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5380                     }else
5381                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5382                 }
5383             }
5384         }else{
5385             assert(IS_8X16(mb_type));
5386             for(list=0; list<2; list++){
5387                 if(h->ref_count[list]>0){
5388                     for(i=0; i<2; i++){
5389                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5390                             unsigned int val= get_te0_golomb(&s->gb, h->ref_count[list]);
5391                             if(val >= h->ref_count[list]){
5392                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
5393                                 return -1;
5394                             }
5395                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
5396                         }else
5397                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5398                     }
5399                 }
5400             }
5401             for(list=0; list<2; list++){
5402                 for(i=0; i<2; i++){
5403                     if(IS_DIR(mb_type, i, list)){
5404                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
5405                         mx += get_se_golomb(&s->gb);
5406                         my += get_se_golomb(&s->gb);
5407                         tprintf("final mv:%d %d\n", mx, my);
5408
5409                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5410                     }else
5411                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5412                 }
5413             }
5414         }
5415     }
5416
5417     if(IS_INTER(mb_type))
5418         write_back_motion(h, mb_type);
5419
5420     if(!IS_INTRA16x16(mb_type)){
5421         cbp= get_ue_golomb(&s->gb);
5422         if(cbp > 47){
5423             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
5424             return -1;
5425         }
5426
5427         if(IS_INTRA4x4(mb_type))
5428             cbp= golomb_to_intra4x4_cbp[cbp];
5429         else
5430             cbp= golomb_to_inter_cbp[cbp];
5431     }
5432     h->cbp = cbp;
5433
5434     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
5435         if(get_bits1(&s->gb))
5436             mb_type |= MB_TYPE_8x8DCT;
5437     }
5438     s->current_picture.mb_type[mb_xy]= mb_type;
5439
5440     if(cbp || IS_INTRA16x16(mb_type)){
5441         int i8x8, i4x4, chroma_idx;
5442         int chroma_qp, dquant;
5443         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
5444         const uint8_t *scan, *scan8x8, *dc_scan;
5445
5446 //        fill_non_zero_count_cache(h);
5447
5448         if(IS_INTERLACED(mb_type)){
5449             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
5450             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5451             dc_scan= luma_dc_field_scan;
5452         }else{
5453             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
5454             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5455             dc_scan= luma_dc_zigzag_scan;
5456         }
5457
5458         dquant= get_se_golomb(&s->gb);
5459
5460         if( dquant > 25 || dquant < -26 ){
5461             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
5462             return -1;
5463         }
5464
5465         s->qscale += dquant;
5466         if(((unsigned)s->qscale) > 51){
5467             if(s->qscale<0) s->qscale+= 52;
5468             else            s->qscale-= 52;
5469         }
5470
5471         h->chroma_qp= chroma_qp= get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
5472         if(IS_INTRA16x16(mb_type)){
5473             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
5474                 return -1; //FIXME continue if partitioned and other return -1 too
5475             }
5476
5477             assert((cbp&15) == 0 || (cbp&15) == 15);
5478
5479             if(cbp&15){
5480                 for(i8x8=0; i8x8<4; i8x8++){
5481                     for(i4x4=0; i4x4<4; i4x4++){
5482                         const int index= i4x4 + 4*i8x8;
5483                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
5484                             return -1;
5485                         }
5486                     }
5487                 }
5488             }else{
5489                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5490             }
5491         }else{
5492             for(i8x8=0; i8x8<4; i8x8++){
5493                 if(cbp & (1<<i8x8)){
5494                     if(IS_8x8DCT(mb_type)){
5495                         DCTELEM *buf = &h->mb[64*i8x8];
5496                         uint8_t *nnz;
5497                         for(i4x4=0; i4x4<4; i4x4++){
5498                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
5499                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
5500                                 return -1;
5501                         }
5502                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5503                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
5504                     }else{
5505                         for(i4x4=0; i4x4<4; i4x4++){
5506                             const int index= i4x4 + 4*i8x8;
5507
5508                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
5509                                 return -1;
5510                             }
5511                         }
5512                     }
5513                 }else{
5514                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5515                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5516                 }
5517             }
5518         }
5519
5520         if(cbp&0x30){
5521             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
5522                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
5523                     return -1;
5524                 }
5525         }
5526
5527         if(cbp&0x20){
5528             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
5529                 for(i4x4=0; i4x4<4; i4x4++){
5530                     const int index= 16 + 4*chroma_idx + i4x4;
5531                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][chroma_qp], 15) < 0){
5532                         return -1;
5533                     }
5534                 }
5535             }
5536         }else{
5537             uint8_t * const nnz= &h->non_zero_count_cache[0];
5538             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5539             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5540         }
5541     }else{
5542         uint8_t * const nnz= &h->non_zero_count_cache[0];
5543         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5544         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5545         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5546     }
5547     s->current_picture.qscale_table[mb_xy]= s->qscale;
5548     write_back_non_zero_count(h);
5549
5550     if(MB_MBAFF){
5551         h->ref_count[0] >>= 1;
5552         h->ref_count[1] >>= 1;
5553     }
5554
5555     return 0;
5556 }
5557
5558 static int decode_cabac_field_decoding_flag(H264Context *h) {
5559     MpegEncContext * const s = &h->s;
5560     const int mb_x = s->mb_x;
5561     const int mb_y = s->mb_y & ~1;
5562     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
5563     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
5564
5565     unsigned int ctx = 0;
5566
5567     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
5568         ctx += 1;
5569     }
5570     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
5571         ctx += 1;
5572     }
5573
5574     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
5575 }
5576
5577 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
5578     uint8_t *state= &h->cabac_state[ctx_base];
5579     int mb_type;
5580
5581     if(intra_slice){
5582         MpegEncContext * const s = &h->s;
5583         const int mba_xy = h->left_mb_xy[0];
5584         const int mbb_xy = h->top_mb_xy;
5585         int ctx=0;
5586         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
5587             ctx++;
5588         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
5589             ctx++;
5590         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
5591             return 0;   /* I4x4 */
5592         state += 2;
5593     }else{
5594         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
5595             return 0;   /* I4x4 */
5596     }
5597
5598     if( get_cabac_terminate( &h->cabac ) )
5599         return 25;  /* PCM */
5600
5601     mb_type = 1; /* I16x16 */
5602     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
5603     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
5604         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
5605     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
5606     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
5607     return mb_type;
5608 }
5609
5610 static int decode_cabac_mb_type( H264Context *h ) {
5611     MpegEncContext * const s = &h->s;
5612
5613     if( h->slice_type == I_TYPE ) {
5614         return decode_cabac_intra_mb_type(h, 3, 1);
5615     } else if( h->slice_type == P_TYPE ) {
5616         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5617             /* P-type */
5618             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5619                 /* P_L0_D16x16, P_8x8 */
5620                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5621             } else {
5622                 /* P_L0_D8x16, P_L0_D16x8 */
5623                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5624             }
5625         } else {
5626             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
5627         }
5628     } else if( h->slice_type == B_TYPE ) {
5629         const int mba_xy = h->left_mb_xy[0];
5630         const int mbb_xy = h->top_mb_xy;
5631         int ctx = 0;
5632         int bits;
5633
5634         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
5635             ctx++;
5636         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
5637             ctx++;
5638
5639         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
5640             return 0; /* B_Direct_16x16 */
5641
5642         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
5643             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
5644         }
5645
5646         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
5647         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
5648         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
5649         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5650         if( bits < 8 )
5651             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
5652         else if( bits == 13 ) {
5653             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
5654         } else if( bits == 14 )
5655             return 11; /* B_L1_L0_8x16 */
5656         else if( bits == 15 )
5657             return 22; /* B_8x8 */
5658
5659         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5660         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5661     } else {
5662         /* TODO SI/SP frames? */
5663         return -1;
5664     }
5665 }
5666
5667 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
5668     MpegEncContext * const s = &h->s;
5669     int mba_xy, mbb_xy;
5670     int ctx = 0;
5671
5672     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
5673         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
5674         mba_xy = mb_xy - 1;
5675         if( (mb_y&1)
5676             && h->slice_table[mba_xy] == h->slice_num
5677             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
5678             mba_xy += s->mb_stride;
5679         if( MB_FIELD ){
5680             mbb_xy = mb_xy - s->mb_stride;
5681             if( !(mb_y&1)
5682                 && h->slice_table[mbb_xy] == h->slice_num
5683                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
5684                 mbb_xy -= s->mb_stride;
5685         }else
5686             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
5687     }else{
5688         int mb_xy = mb_x + mb_y*s->mb_stride;
5689         mba_xy = mb_xy - 1;
5690         mbb_xy = mb_xy - s->mb_stride;
5691     }
5692
5693     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5694         ctx++;
5695     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5696         ctx++;
5697
5698     if( h->slice_type == B_TYPE )
5699         ctx += 13;
5700     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
5701 }
5702
5703 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5704     int mode = 0;
5705
5706     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5707         return pred_mode;
5708
5709     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5710     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5711     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5712
5713     if( mode >= pred_mode )
5714         return mode + 1;
5715     else
5716         return mode;
5717 }
5718
5719 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5720     const int mba_xy = h->left_mb_xy[0];
5721     const int mbb_xy = h->top_mb_xy;
5722
5723     int ctx = 0;
5724
5725     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5726     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5727         ctx++;
5728
5729     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5730         ctx++;
5731
5732     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5733         return 0;
5734
5735     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5736         return 1;
5737     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5738         return 2;
5739     else
5740         return 3;
5741 }
5742
/** x position (0..3) of block number i inside the 4x4 grid of blocks */
static const uint8_t block_idx_x[16] = {
    0, 1, 0, 1,
    2, 3, 2, 3,
    0, 1, 0, 1,
    2, 3, 2, 3
};
/** y position (0..3) of block number i inside the 4x4 grid of blocks */
static const uint8_t block_idx_y[16] = {
    0, 0, 1, 1,
    0, 0, 1, 1,
    2, 2, 3, 3,
    2, 2, 3, 3
};
/** inverse of the two tables above: block number at position [x][y] */
static const uint8_t block_idx_xy[4][4] = {
    {  0,  2,  8, 10 },
    {  1,  3,  9, 11 },
    {  4,  6, 12, 14 },
    {  5,  7, 13, 15 }
};
5755
5756 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5757     int cbp = 0;
5758     int cbp_b = -1;
5759     int i8x8;
5760
5761     if( h->slice_table[h->top_mb_xy] == h->slice_num ) {
5762         cbp_b = h->top_cbp;
5763         tprintf("cbp_b = top_cbp = %x\n", cbp_b);
5764     }
5765
5766     for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5767         int cbp_a = -1;
5768         int x, y;
5769         int ctx = 0;
5770
5771         x = block_idx_x[4*i8x8];
5772         y = block_idx_y[4*i8x8];
5773
5774         if( x > 0 )
5775             cbp_a = cbp;
5776         else if( h->slice_table[h->left_mb_xy[0]] == h->slice_num ) {
5777             cbp_a = h->left_cbp;
5778             tprintf("cbp_a = left_cbp = %x\n", cbp_a);
5779         }
5780
5781         if( y > 0 )
5782             cbp_b = cbp;
5783
5784         /* No need to test for skip as we put 0 for skip block */
5785         /* No need to test for IPCM as we put 1 for IPCM block */
5786         if( cbp_a >= 0 ) {
5787             int i8x8a = block_idx_xy[(x-1)&0x03][y]/4;
5788             if( ((cbp_a >> i8x8a)&0x01) == 0 )
5789                 ctx++;
5790         }
5791
5792         if( cbp_b >= 0 ) {
5793             int i8x8b = block_idx_xy[x][(y-1)&0x03]/4;
5794             if( ((cbp_b >> i8x8b)&0x01) == 0 )
5795                 ctx += 2;
5796         }
5797
5798         if( get_cabac( &h->cabac, &h->cabac_state[73 + ctx] ) ) {
5799             cbp |= 1 << i8x8;
5800         }
5801     }
5802     return cbp;
5803 }
5804 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5805     int ctx;
5806     int cbp_a, cbp_b;
5807
5808     cbp_a = (h->left_cbp>>4)&0x03;
5809     cbp_b = (h-> top_cbp>>4)&0x03;
5810
5811     ctx = 0;
5812     if( cbp_a > 0 ) ctx++;
5813     if( cbp_b > 0 ) ctx += 2;
5814     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5815         return 0;
5816
5817     ctx = 4;
5818     if( cbp_a == 2 ) ctx++;
5819     if( cbp_b == 2 ) ctx += 2;
5820     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5821 }
5822 static int decode_cabac_mb_dqp( H264Context *h) {
5823     MpegEncContext * const s = &h->s;
5824     int mbn_xy;
5825     int   ctx = 0;
5826     int   val = 0;
5827
5828     if( s->mb_x > 0 )
5829         mbn_xy = s->mb_x + s->mb_y*s->mb_stride - 1;
5830     else
5831         mbn_xy = s->mb_width - 1 + (s->mb_y-1)*s->mb_stride;
5832
5833     if( h->last_qscale_diff != 0 )
5834         ctx++;
5835
5836     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5837         if( ctx < 2 )
5838             ctx = 2;
5839         else
5840             ctx = 3;
5841         val++;
5842         if(val > 102) //prevent infinite loop
5843             return INT_MIN;
5844     }
5845
5846     if( val&0x01 )
5847         return (val + 1)/2;
5848     else
5849         return -(val + 1)/2;
5850 }
5851 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5852     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5853         return 0;   /* 8x8 */
5854     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5855         return 1;   /* 8x4 */
5856     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5857         return 2;   /* 4x8 */
5858     return 3;       /* 4x4 */
5859 }
5860 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5861     int type;
5862     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5863         return 0;   /* B_Direct_8x8 */
5864     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5865         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5866     type = 3;
5867     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5868         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5869             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5870         type += 4;
5871     }
5872     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5873     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5874     return type;
5875 }
5876
5877 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5878     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5879 }
5880
5881 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5882     int refa = h->ref_cache[list][scan8[n] - 1];
5883     int refb = h->ref_cache[list][scan8[n] - 8];
5884     int ref  = 0;
5885     int ctx  = 0;
5886
5887     if( h->slice_type == B_TYPE) {
5888         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5889             ctx++;
5890         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5891             ctx += 2;
5892     } else {
5893         if( refa > 0 )
5894             ctx++;
5895         if( refb > 0 )
5896             ctx += 2;
5897     }
5898
5899     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5900         ref++;
5901         if( ctx < 4 )
5902             ctx = 4;
5903         else
5904             ctx = 5;
5905         if(ref >= 32 /*h->ref_list[list]*/){
5906             av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
5907             return 0; //FIXME we should return -1 and check the return everywhere
5908         }
5909     }
5910     return ref;
5911 }
5912
5913 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5914     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5915                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5916     int ctxbase = (l == 0) ? 40 : 47;
5917     int ctx, mvd;
5918
5919     if( amvd < 3 )
5920         ctx = 0;
5921     else if( amvd > 32 )
5922         ctx = 2;
5923     else
5924         ctx = 1;
5925
5926     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5927         return 0;
5928
5929     mvd= 1;
5930     ctx= 3;
5931     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5932         mvd++;
5933         if( ctx < 6 )
5934             ctx++;
5935     }
5936
5937     if( mvd >= 9 ) {
5938         int k = 3;
5939         while( get_cabac_bypass( &h->cabac ) ) {
5940             mvd += 1 << k;
5941             k++;
5942             if(k>24){
5943                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5944                 return INT_MIN;
5945             }
5946         }
5947         while( k-- ) {
5948             if( get_cabac_bypass( &h->cabac ) )
5949                 mvd += 1 << k;
5950         }
5951     }
5952     return get_cabac_bypass_sign( &h->cabac, -mvd );
5953 }
5954
5955 static int inline get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
5956     int nza, nzb;
5957     int ctx = 0;
5958
5959     if( cat == 0 ) {
5960         nza = h->left_cbp&0x100;
5961         nzb = h-> top_cbp&0x100;
5962     } else if( cat == 1 || cat == 2 ) {
5963         nza = h->non_zero_count_cache[scan8[idx] - 1];
5964         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5965     } else if( cat == 3 ) {
5966         nza = (h->left_cbp>>(6+idx))&0x01;
5967         nzb = (h-> top_cbp>>(6+idx))&0x01;
5968     } else {
5969         assert(cat == 4);
5970         nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5971         nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5972     }
5973
5974     if( nza > 0 )
5975         ctx++;
5976
5977     if( nzb > 0 )
5978         ctx += 2;
5979
5980     return ctx + 4 * cat;
5981 }
5982
/**
 * Context offset of the "last significant coefficient" flag for 8x8 blocks,
 * indexed by scan position 0..62.
 * NOTE(review): the used attribute presumably keeps the table alive for
 * references the compiler cannot see (e.g. inline assembly) - confirm.
 */
static const __attribute((used)) uint8_t last_coeff_flag_offset_8x8[63] = {
    /* position  0     */ 0,
    /* positions 1..15 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* positions 16..31 */
    2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2,
    /* positions 32..39 */
    3, 3, 3, 3, 3, 3, 3, 3,
    /* positions 40..47 */
    4, 4, 4, 4, 4, 4, 4, 4,
    /* positions 48..62 */
    5, 5, 5, 5,
    6, 6, 6, 6,
    7, 7, 7, 7,
    8, 8, 8
};
5989
5990 static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff) {
5991     const int mb_xy  = h->s.mb_x + h->s.mb_y*h->s.mb_stride;
5992     static const int significant_coeff_flag_offset[2][6] = {
5993       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5994       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5995     };
5996     static const int last_coeff_flag_offset[2][6] = {
5997       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5998       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5999     };
6000     static const int coeff_abs_level_m1_offset[6] = {
6001         227+0, 227+10, 227+20, 227+30, 227+39, 426
6002     };
6003     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
6004       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
6005         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
6006         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
6007        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
6008       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
6009         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
6010         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
6011         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
6012     };
6013
6014     int index[64];
6015
6016     int last;
6017     int coeff_count = 0;
6018
6019     int abslevel1 = 1;
6020     int abslevelgt1 = 0;
6021
6022     uint8_t *significant_coeff_ctx_base;
6023     uint8_t *last_coeff_ctx_base;
6024     uint8_t *abs_level_m1_ctx_base;
6025
6026 #ifndef ARCH_X86
6027 #define CABAC_ON_STACK
6028 #endif
6029 #ifdef CABAC_ON_STACK
6030 #define CC &cc
6031     CABACContext cc;
6032     cc.range     = h->cabac.range;
6033     cc.low       = h->cabac.low;
6034     cc.bytestream= h->cabac.bytestream;
6035 #else
6036 #define CC &h->cabac
6037 #endif
6038
6039
6040     /* cat: 0-> DC 16x16  n = 0
6041      *      1-> AC 16x16  n = luma4x4idx
6042      *      2-> Luma4x4   n = luma4x4idx
6043      *      3-> DC Chroma n = iCbCr
6044      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
6045      *      5-> Luma8x8   n = 4 * luma8x8idx
6046      */
6047
6048     /* read coded block flag */
6049     if( cat != 5 ) {
6050         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n ) ] ) == 0 ) {
6051             if( cat == 1 || cat == 2 )
6052                 h->non_zero_count_cache[scan8[n]] = 0;
6053             else if( cat == 4 )
6054                 h->non_zero_count_cache[scan8[16+n]] = 0;
6055 #ifdef CABAC_ON_STACK
6056             h->cabac.range     = cc.range     ;
6057             h->cabac.low       = cc.low       ;
6058             h->cabac.bytestream= cc.bytestream;
6059 #endif
6060             return 0;
6061         }
6062     }
6063
6064     significant_coeff_ctx_base = h->cabac_state
6065         + significant_coeff_flag_offset[MB_FIELD][cat];
6066     last_coeff_ctx_base = h->cabac_state
6067         + last_coeff_flag_offset[MB_FIELD][cat];
6068     abs_level_m1_ctx_base = h->cabac_state
6069         + coeff_abs_level_m1_offset[cat];
6070
6071     if( cat == 5 ) {
6072 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
6073         for(last= 0; last < coefs; last++) { \
6074             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
6075             if( get_cabac( CC, sig_ctx )) { \
6076                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
6077                 index[coeff_count++] = last; \
6078                 if( get_cabac( CC, last_ctx ) ) { \
6079                     last= max_coeff; \
6080                     break; \
6081                 } \
6082             } \
6083         }\
6084         if( last == max_coeff -1 ) {\
6085             index[coeff_count++] = last;\
6086         }
6087         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
6088 #if defined(ARCH_X86) && !(defined(PIC) && defined(__GNUC__))
6089         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
6090     } else {
6091         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
6092 #else
6093         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
6094     } else {
6095         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
6096 #endif
6097     }
6098     assert(coeff_count > 0);
6099
6100     if( cat == 0 )
6101         h->cbp_table[mb_xy] |= 0x100;
6102     else if( cat == 1 || cat == 2 )
6103         h->non_zero_count_cache[scan8[n]] = coeff_count;
6104     else if( cat == 3 )
6105         h->cbp_table[mb_xy] |= 0x40 << n;
6106     else if( cat == 4 )
6107         h->non_zero_count_cache[scan8[16+n]] = coeff_count;
6108     else {
6109         assert( cat == 5 );
6110         fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
6111     }
6112
6113     for( coeff_count--; coeff_count >= 0; coeff_count-- ) {
6114         uint8_t *ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 )) + abs_level_m1_ctx_base;
6115         int j= scantable[index[coeff_count]];
6116
6117         if( get_cabac( CC, ctx ) == 0 ) {
6118             if( !qmul ) {
6119                 block[j] = get_cabac_bypass_sign( CC, -1);
6120             }else{
6121                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;;
6122             }
6123
6124             abslevel1++;
6125         } else {
6126             int coeff_abs = 2;
6127             ctx = 5 + FFMIN( 4, abslevelgt1 ) + abs_level_m1_ctx_base;
6128             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
6129                 coeff_abs++;
6130             }
6131
6132             if( coeff_abs >= 15 ) {
6133                 int j = 0;
6134                 while( get_cabac_bypass( CC ) ) {
6135                     j++;
6136                 }
6137
6138                 coeff_abs=1;
6139                 while( j-- ) {
6140                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
6141                 }
6142                 coeff_abs+= 14;
6143             }
6144
6145             if( !qmul ) {
6146                 if( get_cabac_bypass( CC ) ) block[j] = -coeff_abs;
6147                 else                                block[j] =  coeff_abs;
6148             }else{
6149                 if( get_cabac_bypass( CC ) ) block[j] = (-coeff_abs * qmul[j] + 32) >> 6;
6150                 else                                block[j] = ( coeff_abs * qmul[j] + 32) >> 6;
6151             }
6152
6153             abslevelgt1++;
6154         }
6155     }
6156 #ifdef CABAC_ON_STACK
6157             h->cabac.range     = cc.range     ;
6158             h->cabac.low       = cc.low       ;
6159             h->cabac.bytestream= cc.bytestream;
6160 #endif
6161     return 0;
6162 }
6163
6164 static void inline compute_mb_neighbors(H264Context *h)
6165 {
6166     MpegEncContext * const s = &h->s;
6167     const int mb_xy  = s->mb_x + s->mb_y*s->mb_stride;
6168     h->top_mb_xy     = mb_xy - s->mb_stride;
6169     h->left_mb_xy[0] = mb_xy - 1;
6170     if(FRAME_MBAFF){
6171         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
6172         const int top_pair_xy      = pair_xy     - s->mb_stride;
6173         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
6174         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
6175         const int curr_mb_frame_flag = !MB_FIELD;
6176         const int bottom = (s->mb_y & 1);
6177         if (bottom
6178                 ? !curr_mb_frame_flag // bottom macroblock
6179                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
6180                 ) {
6181             h->top_mb_xy -= s->mb_stride;
6182         }
6183         if (left_mb_frame_flag != curr_mb_frame_flag) {
6184             h->left_mb_xy[0] = pair_xy - 1;
6185         }
6186     }
6187     return;
6188 }
6189
/**
 * Decodes one CABAC coded macroblock.
 * @returns 0 on success, -1 if an error is detected while decoding
 */
6194 static int decode_mb_cabac(H264Context *h) {
6195     MpegEncContext * const s = &h->s;
6196     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
6197     int mb_type, partition_count, cbp = 0;
6198     int dct8x8_allowed= h->pps.transform_8x8_mode;
6199
6200     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
6201
6202     tprintf("pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
6203     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE ) {
6204         int skip;
6205         /* a skipped mb needs the aff flag from the following mb */
6206         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
6207             predict_field_decoding_flag(h);
6208         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
6209             skip = h->next_mb_skipped;
6210         else
6211             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
6212         /* read skip flags */
6213         if( skip ) {
6214             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
6215                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
6216                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
6217                 if(h->next_mb_skipped)
6218                     predict_field_decoding_flag(h);
6219                 else
6220                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
6221             }
6222
6223             decode_mb_skip(h);
6224
6225             h->cbp_table[mb_xy] = 0;
6226             h->chroma_pred_mode_table[mb_xy] = 0;
6227             h->last_qscale_diff = 0;
6228
6229             return 0;
6230
6231         }
6232     }
6233     if(FRAME_MBAFF){
6234         if( (s->mb_y&1) == 0 )
6235             h->mb_mbaff =
6236             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
6237     }else
6238         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
6239
6240     h->prev_mb_skipped = 0;
6241
6242     compute_mb_neighbors(h);
6243     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
6244         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
6245         return -1;
6246     }
6247
6248     if( h->slice_type == B_TYPE ) {
6249         if( mb_type < 23 ){
6250             partition_count= b_mb_type_info[mb_type].partition_count;
6251             mb_type=         b_mb_type_info[mb_type].type;
6252         }else{
6253             mb_type -= 23;
6254             goto decode_intra_mb;
6255         }
6256     } else if( h->slice_type == P_TYPE ) {
6257         if( mb_type < 5) {
6258             partition_count= p_mb_type_info[mb_type].partition_count;
6259             mb_type=         p_mb_type_info[mb_type].type;
6260         } else {
6261             mb_type -= 5;
6262             goto decode_intra_mb;
6263         }
6264     } else {
6265        assert(h->slice_type == I_TYPE);
6266 decode_intra_mb:
6267         partition_count = 0;
6268         cbp= i_mb_type_info[mb_type].cbp;
6269         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
6270         mb_type= i_mb_type_info[mb_type].type;
6271     }
6272     if(MB_FIELD)
6273         mb_type |= MB_TYPE_INTERLACED;
6274
6275     h->slice_table[ mb_xy ]= h->slice_num;
6276
6277     if(IS_INTRA_PCM(mb_type)) {
6278         const uint8_t *ptr;
6279         unsigned int x, y;
6280
6281         // We assume these blocks are very rare so we dont optimize it.
6282         // FIXME The two following lines get the bitstream position in the cabac
6283         // decode, I think it should be done by a function in cabac.h (or cabac.c).
6284         ptr= h->cabac.bytestream;
6285         if(h->cabac.low&0x1) ptr--;
6286         if(CABAC_BITS==16){
6287             if(h->cabac.low&0x1FF) ptr--;
6288         }
6289
6290         // The pixels are stored in the same order as levels in h->mb array.
6291         for(y=0; y<16; y++){
6292             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
6293             for(x=0; x<16; x++){
6294                 tprintf("LUMA ICPM LEVEL (%3d)\n", *ptr);
6295                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
6296             }
6297         }
6298         for(y=0; y<8; y++){
6299             const int index= 256 + 4*(y&3) + 32*(y>>2);
6300             for(x=0; x<8; x++){
6301                 tprintf("CHROMA U ICPM LEVEL (%3d)\n", *ptr);
6302                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
6303             }
6304         }
6305         for(y=0; y<8; y++){
6306             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
6307             for(x=0; x<8; x++){
6308                 tprintf("CHROMA V ICPM LEVEL (%3d)\n", *ptr);
6309                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
6310             }
6311         }
6312
6313         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
6314
6315         // All blocks are present
6316         h->cbp_table[mb_xy] = 0x1ef;
6317         h->chroma_pred_mode_table[mb_xy] = 0;
6318         // In deblocking, the quantizer is 0
6319         s->current_picture.qscale_table[mb_xy]= 0;
6320         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0);
6321         // All coeffs are present
6322         memset(h->non_zero_count[mb_xy], 16, 16);
6323         s->current_picture.mb_type[mb_xy]= mb_type;
6324         return 0;
6325     }
6326
6327     if(MB_MBAFF){
6328         h->ref_count[0] <<= 1;
6329         h->ref_count[1] <<= 1;
6330     }
6331
6332     fill_caches(h, mb_type, 0);
6333
6334     if( IS_INTRA( mb_type ) ) {
6335         int i, pred_mode;
6336         if( IS_INTRA4x4( mb_type ) ) {
6337             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
6338                 mb_type |= MB_TYPE_8x8DCT;
6339                 for( i = 0; i < 16; i+=4 ) {
6340                     int pred = pred_intra_mode( h, i );
6341                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
6342                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
6343                 }
6344             } else {
6345                 for( i = 0; i < 16; i++ ) {
6346                     int pred = pred_intra_mode( h, i );
6347                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
6348
6349                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
6350                 }
6351             }
6352             write_back_intra_pred_mode(h);
6353             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
6354         } else {
6355             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
6356             if( h->intra16x16_pred_mode < 0 ) return -1;
6357         }
6358         h->chroma_pred_mode_table[mb_xy] =
6359         pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
6360
6361         pred_mode= check_intra_pred_mode( h, pred_mode );
6362         if( pred_mode < 0 ) return -1;
6363         h->chroma_pred_mode= pred_mode;
6364     } else if( partition_count == 4 ) {
6365         int i, j, sub_partition_count[4], list, ref[2][4];
6366
6367         if( h->slice_type == B_TYPE ) {
6368             for( i = 0; i < 4; i++ ) {
6369                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
6370                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
6371                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
6372             }
6373             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
6374                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
6375                 pred_direct_motion(h, &mb_type);
6376                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
6377                     for( i = 0; i < 4; i++ )
6378                         if( IS_DIRECT(h->sub_mb_type[i]) )
6379                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
6380                 }
6381             }
6382         } else {
6383             for( i = 0; i < 4; i++ ) {
6384                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
6385                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
6386                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
6387             }
6388         }
6389
6390         for( list = 0; list < 2; list++ ) {
6391             if( h->ref_count[list] > 0 ) {
6392                 for( i = 0; i < 4; i++ ) {
6393                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
6394                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
6395                         if( h->ref_count[list] > 1 )
6396                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
6397                         else
6398                             ref[list][i] = 0;
6399                     } else {
6400                         ref[list][i] = -1;
6401                     }
6402                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
6403                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
6404                 }
6405             }
6406         }
6407
6408         if(dct8x8_allowed)
6409             dct8x8_allowed = get_dct8x8_allowed(h);
6410
6411         for(list=0; list<2; list++){
6412             for(i=0; i<4; i++){
6413                 if(IS_DIRECT(h->sub_mb_type[i])){
6414                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
6415                     continue;
6416                 }
6417                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
6418
6419                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
6420                     const int sub_mb_type= h->sub_mb_type[i];
6421                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
6422                     for(j=0; j<sub_partition_count[i]; j++){
6423                         int mpx, mpy;
6424                         int mx, my;
6425                         const int index= 4*i + block_width*j;
6426                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
6427                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
6428                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
6429
6430                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
6431                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
6432                         tprintf("final mv:%d %d\n", mx, my);
6433
6434                         if(IS_SUB_8X8(sub_mb_type)){
6435                             mv_cache[ 1 ][0]=
6436                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
6437                             mv_cache[ 1 ][1]=
6438                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
6439
6440                             mvd_cache[ 1 ][0]=
6441                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
6442                             mvd_cache[ 1 ][1]=
6443                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
6444                         }else if(IS_SUB_8X4(sub_mb_type)){
6445                             mv_cache[ 1 ][0]= mx;
6446                             mv_cache[ 1 ][1]= my;
6447
6448                             mvd_cache[ 1 ][0]= mx - mpx;
6449                             mvd_cache[ 1 ][1]= my - mpy;
6450                         }else if(IS_SUB_4X8(sub_mb_type)){
6451                             mv_cache[ 8 ][0]= mx;
6452                             mv_cache[ 8 ][1]= my;
6453
6454                             mvd_cache[ 8 ][0]= mx - mpx;
6455                             mvd_cache[ 8 ][1]= my - mpy;
6456                         }
6457                         mv_cache[ 0 ][0]= mx;
6458                         mv_cache[ 0 ][1]= my;
6459
6460                         mvd_cache[ 0 ][0]= mx - mpx;
6461                         mvd_cache[ 0 ][1]= my - mpy;
6462                     }
6463                 }else{
6464                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
6465                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
6466                     p[0] = p[1] = p[8] = p[9] = 0;
6467                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
6468                 }
6469             }
6470         }
6471     } else if( IS_DIRECT(mb_type) ) {
6472         pred_direct_motion(h, &mb_type);
6473         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
6474         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
6475         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
6476     } else {
6477         int list, mx, my, i, mpx, mpy;
6478         if(IS_16X16(mb_type)){
6479             for(list=0; list<2; list++){
6480                 if(IS_DIR(mb_type, 0, list)){
6481                     if(h->ref_count[list] > 0 ){
6482                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
6483                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
6484                     }
6485                 }else
6486                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
6487             }
6488             for(list=0; list<2; list++){
6489                 if(IS_DIR(mb_type, 0, list)){
6490                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
6491
6492                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
6493                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
6494                     tprintf("final mv:%d %d\n", mx, my);
6495
6496                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
6497                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
6498                 }else
6499                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
6500             }
6501         }
6502         else if(IS_16X8(mb_type)){
6503             for(list=0; list<2; list++){
6504                 if(h->ref_count[list]>0){
6505                     for(i=0; i<2; i++){
6506                         if(IS_DIR(mb_type, i, list)){
6507                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
6508                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
6509                         }else
6510                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
6511                     }
6512                 }
6513             }
6514             for(list=0; list<2; list++){
6515                 for(i=0; i<2; i++){
6516                     if(IS_DIR(mb_type, i, list)){
6517                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
6518                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
6519                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
6520                         tprintf("final mv:%d %d\n", mx, my);
6521
6522                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
6523                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
6524                     }else{
6525                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
6526                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
6527                     }
6528                 }
6529             }
6530         }else{
6531             assert(IS_8X16(mb_type));
6532             for(list=0; list<2; list++){
6533                 if(h->ref_count[list]>0){
6534                     for(i=0; i<2; i++){
6535                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
6536                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
6537                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
6538                         }else
6539                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
6540                     }
6541                 }
6542             }
6543             for(list=0; list<2; list++){
6544                 for(i=0; i<2; i++){
6545                     if(IS_DIR(mb_type, i, list)){
6546                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
6547                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
6548                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
6549
6550                         tprintf("final mv:%d %d\n", mx, my);
6551                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
6552                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
6553                     }else{
6554                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
6555                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
6556                     }
6557                 }
6558             }
6559         }
6560     }
6561
6562    if( IS_INTER( mb_type ) ) {
6563         h->chroma_pred_mode_table[mb_xy] = 0;
6564         write_back_motion( h, mb_type );
6565    }
6566
6567     if( !IS_INTRA16x16( mb_type ) ) {
6568         cbp  = decode_cabac_mb_cbp_luma( h );
6569         cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
6570     }
6571
6572     h->cbp_table[mb_xy] = h->cbp = cbp;
6573
6574     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
6575         if( decode_cabac_mb_transform_size( h ) )
6576             mb_type |= MB_TYPE_8x8DCT;
6577     }
6578     s->current_picture.mb_type[mb_xy]= mb_type;
6579
6580     if( cbp || IS_INTRA16x16( mb_type ) ) {
6581         const uint8_t *scan, *scan8x8, *dc_scan;
6582         int dqp;
6583
6584         if(IS_INTERLACED(mb_type)){
6585             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
6586             scan= s->qscale ? h->field_scan : h->field_scan_q0;
6587             dc_scan= luma_dc_field_scan;
6588         }else{
6589             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
6590             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
6591             dc_scan= luma_dc_zigzag_scan;
6592         }
6593
6594         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
6595         if( dqp == INT_MIN ){
6596             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
6597             return -1;
6598         }
6599         s->qscale += dqp;
6600         if(((unsigned)s->qscale) > 51){
6601             if(s->qscale<0) s->qscale+= 52;
6602             else            s->qscale-= 52;
6603         }
6604         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
6605
6606         if( IS_INTRA16x16( mb_type ) ) {
6607             int i;
6608             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
6609             if( decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16) < 0)
6610                 return -1;
6611             if( cbp&15 ) {
6612                 for( i = 0; i < 16; i++ ) {
6613                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
6614                     if( decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 )
6615                         return -1;
6616                 }
6617             } else {
6618                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
6619             }
6620         } else {
6621             int i8x8, i4x4;
6622             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
6623                 if( cbp & (1<<i8x8) ) {
6624                     if( IS_8x8DCT(mb_type) ) {
6625                         if( decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
6626                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64) < 0 )
6627                             return -1;
6628                     } else
6629                     for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
6630                         const int index = 4*i8x8 + i4x4;
6631                         //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
6632 //START_TIMER
6633                         if( decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) < 0 )
6634                             return -1;
6635 //STOP_TIMER("decode_residual")
6636                     }
6637                 } else {
6638                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
6639                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
6640                 }
6641             }
6642         }
6643
6644         if( cbp&0x30 ){
6645             int c;
6646             for( c = 0; c < 2; c++ ) {
6647                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
6648                 if( decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4) < 0)
6649                     return -1;
6650             }
6651         }
6652
6653         if( cbp&0x20 ) {
6654             int c, i;
6655             for( c = 0; c < 2; c++ ) {
6656                 for( i = 0; i < 4; i++ ) {
6657                     const int index = 16 + 4 * c + i;
6658                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
6659                     if( decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp], 15) < 0)
6660                         return -1;
6661                 }
6662             }
6663         } else {
6664             uint8_t * const nnz= &h->non_zero_count_cache[0];
6665             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6666             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6667         }
6668     } else {
6669         uint8_t * const nnz= &h->non_zero_count_cache[0];
6670         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
6671         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6672         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6673         h->last_qscale_diff = 0;
6674     }
6675
6676     s->current_picture.qscale_table[mb_xy]= s->qscale;
6677     write_back_non_zero_count(h);
6678
6679     if(MB_MBAFF){
6680         h->ref_count[0] >>= 1;
6681         h->ref_count[1] >>= 1;
6682     }
6683
6684     return 0;
6685 }
6686
6687
6688 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6689     int i, d;
6690     const int index_a = qp + h->slice_alpha_c0_offset;
6691     const int alpha = (alpha_table+52)[index_a];
6692     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6693
6694     if( bS[0] < 4 ) {
6695         int8_t tc[4];
6696         for(i=0; i<4; i++)
6697             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6698         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6699     } else {
6700         /* 16px edge length, because bS=4 is triggered by being at
6701          * the edge of an intra MB, so all 4 bS are the same */
6702             for( d = 0; d < 16; d++ ) {
6703                 const int p0 = pix[-1];
6704                 const int p1 = pix[-2];
6705                 const int p2 = pix[-3];
6706
6707                 const int q0 = pix[0];
6708                 const int q1 = pix[1];
6709                 const int q2 = pix[2];
6710
6711                 if( FFABS( p0 - q0 ) < alpha &&
6712                     FFABS( p1 - p0 ) < beta &&
6713                     FFABS( q1 - q0 ) < beta ) {
6714
6715                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6716                         if( FFABS( p2 - p0 ) < beta)
6717                         {
6718                             const int p3 = pix[-4];
6719                             /* p0', p1', p2' */
6720                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6721                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6722                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6723                         } else {
6724                             /* p0' */
6725                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6726                         }
6727                         if( FFABS( q2 - q0 ) < beta)
6728                         {
6729                             const int q3 = pix[3];
6730                             /* q0', q1', q2' */
6731                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6732                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6733                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6734                         } else {
6735                             /* q0' */
6736                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6737                         }
6738                     }else{
6739                         /* p0', q0' */
6740                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6741                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6742                     }
6743                     tprintf("filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
6744                 }
6745                 pix += stride;
6746             }
6747     }
6748 }
6749 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6750     int i;
6751     const int index_a = qp + h->slice_alpha_c0_offset;
6752     const int alpha = (alpha_table+52)[index_a];
6753     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6754
6755     if( bS[0] < 4 ) {
6756         int8_t tc[4];
6757         for(i=0; i<4; i++)
6758             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6759         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6760     } else {
6761         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6762     }
6763 }
6764
6765 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6766     int i;
6767     for( i = 0; i < 16; i++, pix += stride) {
6768         int index_a;
6769         int alpha;
6770         int beta;
6771
6772         int qp_index;
6773         int bS_index = (i >> 1);
6774         if (!MB_FIELD) {
6775             bS_index &= ~1;
6776             bS_index |= (i & 1);
6777         }
6778
6779         if( bS[bS_index] == 0 ) {
6780             continue;
6781         }
6782
6783         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6784         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6785         alpha = (alpha_table+52)[index_a];
6786         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6787
6788         if( bS[bS_index] < 4 ) {
6789             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
6790             const int p0 = pix[-1];
6791             const int p1 = pix[-2];
6792             const int p2 = pix[-3];
6793             const int q0 = pix[0];
6794             const int q1 = pix[1];
6795             const int q2 = pix[2];
6796
6797             if( FFABS( p0 - q0 ) < alpha &&
6798                 FFABS( p1 - p0 ) < beta &&
6799                 FFABS( q1 - q0 ) < beta ) {
6800                 int tc = tc0;
6801                 int i_delta;
6802
6803                 if( FFABS( p2 - p0 ) < beta ) {
6804                     pix[-2] = p1 + clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6805                     tc++;
6806                 }
6807                 if( FFABS( q2 - q0 ) < beta ) {
6808                     pix[1] = q1 + clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6809                     tc++;
6810                 }
6811
6812                 i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6813                 pix[-1] = clip_uint8( p0 + i_delta );    /* p0' */
6814                 pix[0]  = clip_uint8( q0 - i_delta );    /* q0' */
6815                 tprintf("filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6816             }
6817         }else{
6818             const int p0 = pix[-1];
6819             const int p1 = pix[-2];
6820             const int p2 = pix[-3];
6821
6822             const int q0 = pix[0];
6823             const int q1 = pix[1];
6824             const int q2 = pix[2];
6825
6826             if( FFABS( p0 - q0 ) < alpha &&
6827                 FFABS( p1 - p0 ) < beta &&
6828                 FFABS( q1 - q0 ) < beta ) {
6829
6830                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6831                     if( FFABS( p2 - p0 ) < beta)
6832                     {
6833                         const int p3 = pix[-4];
6834                         /* p0', p1', p2' */
6835                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6836                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6837                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6838                     } else {
6839                         /* p0' */
6840                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6841                     }
6842                     if( FFABS( q2 - q0 ) < beta)
6843                     {
6844                         const int q3 = pix[3];
6845                         /* q0', q1', q2' */
6846                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6847                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6848                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6849                     } else {
6850                         /* q0' */
6851                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6852                     }
6853                 }else{
6854                     /* p0', q0' */
6855                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6856                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6857                 }
6858                 tprintf("filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6859             }
6860         }
6861     }
6862 }
6863 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6864     int i;
6865     for( i = 0; i < 8; i++, pix += stride) {
6866         int index_a;
6867         int alpha;
6868         int beta;
6869
6870         int qp_index;
6871         int bS_index = i;
6872
6873         if( bS[bS_index] == 0 ) {
6874             continue;
6875         }
6876
6877         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6878         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6879         alpha = (alpha_table+52)[index_a];
6880         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6881
6882         if( bS[bS_index] < 4 ) {
6883             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6884             const int p0 = pix[-1];
6885             const int p1 = pix[-2];
6886             const int q0 = pix[0];
6887             const int q1 = pix[1];
6888
6889             if( FFABS( p0 - q0 ) < alpha &&
6890                 FFABS( p1 - p0 ) < beta &&
6891                 FFABS( q1 - q0 ) < beta ) {
6892                 const int i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6893
6894                 pix[-1] = clip_uint8( p0 + i_delta );    /* p0' */
6895                 pix[0]  = clip_uint8( q0 - i_delta );    /* q0' */
6896                 tprintf("filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6897             }
6898         }else{
6899             const int p0 = pix[-1];
6900             const int p1 = pix[-2];
6901             const int q0 = pix[0];
6902             const int q1 = pix[1];
6903
6904             if( FFABS( p0 - q0 ) < alpha &&
6905                 FFABS( p1 - p0 ) < beta &&
6906                 FFABS( q1 - q0 ) < beta ) {
6907
6908                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6909                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6910                 tprintf("filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6911             }
6912         }
6913     }
6914 }
6915
6916 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6917     int i, d;
6918     const int index_a = qp + h->slice_alpha_c0_offset;
6919     const int alpha = (alpha_table+52)[index_a];
6920     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6921     const int pix_next  = stride;
6922
6923     if( bS[0] < 4 ) {
6924         int8_t tc[4];
6925         for(i=0; i<4; i++)
6926             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6927         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6928     } else {
6929         /* 16px edge length, see filter_mb_edgev */
6930             for( d = 0; d < 16; d++ ) {
6931                 const int p0 = pix[-1*pix_next];
6932                 const int p1 = pix[-2*pix_next];
6933                 const int p2 = pix[-3*pix_next];
6934                 const int q0 = pix[0];
6935                 const int q1 = pix[1*pix_next];
6936                 const int q2 = pix[2*pix_next];
6937
6938                 if( FFABS( p0 - q0 ) < alpha &&
6939                     FFABS( p1 - p0 ) < beta &&
6940                     FFABS( q1 - q0 ) < beta ) {
6941
6942                     const int p3 = pix[-4*pix_next];
6943                     const int q3 = pix[ 3*pix_next];
6944
6945                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6946                         if( FFABS( p2 - p0 ) < beta) {
6947                             /* p0', p1', p2' */
6948                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6949                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6950                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6951                         } else {
6952                             /* p0' */
6953                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6954                         }
6955                         if( FFABS( q2 - q0 ) < beta) {
6956                             /* q0', q1', q2' */
6957                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6958                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6959                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6960                         } else {
6961                             /* q0' */
6962                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6963                         }
6964                     }else{
6965                         /* p0', q0' */
6966                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6967                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6968                     }
6969                     tprintf("filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6970                 }
6971                 pix++;
6972             }
6973     }
6974 }
6975
6976 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6977     int i;
6978     const int index_a = qp + h->slice_alpha_c0_offset;
6979     const int alpha = (alpha_table+52)[index_a];
6980     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6981
6982     if( bS[0] < 4 ) {
6983         int8_t tc[4];
6984         for(i=0; i<4; i++)
6985             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6986         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6987     } else {
6988         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6989     }
6990 }
6991
6992 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6993     MpegEncContext * const s = &h->s;
6994     int mb_xy, mb_type;
6995     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6996
6997     if(mb_x==0 || mb_y==0 || !s->dsp.h264_loop_filter_strength) {
6998         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6999         return;
7000     }
7001     assert(!FRAME_MBAFF);
7002
7003     mb_xy = mb_x + mb_y*s->mb_stride;
7004     mb_type = s->current_picture.mb_type[mb_xy];
7005     qp = s->current_picture.qscale_table[mb_xy];
7006     qp0 = s->current_picture.qscale_table[mb_xy-1];
7007     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
7008     qpc = get_chroma_qp( h->pps.chroma_qp_index_offset, qp );
7009     qpc0 = get_chroma_qp( h->pps.chroma_qp_index_offset, qp0 );
7010     qpc1 = get_chroma_qp( h->pps.chroma_qp_index_offset, qp1 );
7011     qp0 = (qp + qp0 + 1) >> 1;
7012     qp1 = (qp + qp1 + 1) >> 1;
7013     qpc0 = (qpc + qpc0 + 1) >> 1;
7014     qpc1 = (qpc + qpc1 + 1) >> 1;
7015     qp_thresh = 15 - h->slice_alpha_c0_offset;
7016     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
7017        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
7018         return;
7019
7020     if( IS_INTRA(mb_type) ) {
7021         int16_t bS4[4] = {4,4,4,4};
7022         int16_t bS3[4] = {3,3,3,3};
7023         if( IS_8x8DCT(mb_type) ) {
7024             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
7025             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
7026             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bS4, qp1 );
7027             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
7028         } else {
7029             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
7030             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
7031             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
7032             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
7033             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bS4, qp1 );
7034             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
7035             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
7036             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
7037         }
7038         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
7039         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
7040         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
7041         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
7042         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bS4, qpc1 );
7043         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
7044         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bS4, qpc1 );
7045         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
7046         return;
7047     } else {
7048         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
7049         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
7050         int edges;
7051         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
7052             edges = 4;
7053             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
7054         } else {
7055             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
7056                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
7057             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
7058                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
7059                              ? 3 : 0;
7060             int step = IS_8x8DCT(mb_type) ? 2 : 1;
7061             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
7062             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
7063                                               (h->slice_type == B_TYPE), edges, step, mask_edge0, mask_edge1 );
7064         }
7065         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
7066             bSv[0][0] = 0x0004000400040004ULL;
7067         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
7068             bSv[1][0] = 0x0004000400040004ULL;
7069
7070 #define FILTER(hv,dir,edge)\
7071         if(bSv[dir][edge]) {\
7072             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
7073             if(!(edge&1)) {\
7074                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
7075                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
7076             }\
7077         }
7078         if( edges == 1 ) {
7079             FILTER(v,0,0);
7080             FILTER(h,1,0);
7081         } else if( IS_8x8DCT(mb_type) ) {
7082             FILTER(v,0,0);
7083             FILTER(v,0,2);
7084             FILTER(h,1,0);
7085             FILTER(h,1,2);
7086         } else {
7087             FILTER(v,0,0);
7088             FILTER(v,0,1);
7089             FILTER(v,0,2);
7090             FILTER(v,0,3);
7091             FILTER(h,1,0);
7092             FILTER(h,1,1);
7093             FILTER(h,1,2);
7094             FILTER(h,1,3);
7095         }
7096 #undef FILTER
7097     }
7098 }
7099
7100 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
7101     MpegEncContext * const s = &h->s;
7102     const int mb_xy= mb_x + mb_y*s->mb_stride;
7103     const int mb_type = s->current_picture.mb_type[mb_xy];
7104     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
7105     int first_vertical_edge_done = 0;
7106     int dir;
7107     /* FIXME: A given frame may occupy more than one position in
7108      * the reference list. So ref2frm should be populated with
7109      * frame numbers, not indices. */
7110     static const int ref2frm[34] = {-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
7111                                     16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
7112
7113     //for sufficiently low qp, filtering wouldn't do anything
7114     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
7115     if(!FRAME_MBAFF){
7116         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX(0, h->pps.chroma_qp_index_offset);
7117         int qp = s->current_picture.qscale_table[mb_xy];
7118         if(qp <= qp_thresh
7119            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
7120            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
7121             return;
7122         }
7123     }
7124
7125     if (FRAME_MBAFF
7126             // left mb is in picture
7127             && h->slice_table[mb_xy-1] != 255
7128             // and current and left pair do not have the same interlaced type
7129             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
7130             // and left mb is in the same slice if deblocking_filter == 2
7131             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
7132         /* First vertical edge is different in MBAFF frames
7133          * There are 8 different bS to compute and 2 different Qp
7134          */
7135         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
7136         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
7137         int16_t bS[8];
7138         int qp[2];
7139         int chroma_qp[2];
7140         int mb_qp, mbn0_qp, mbn1_qp;
7141         int i;
7142         first_vertical_edge_done = 1;
7143
7144         if( IS_INTRA(mb_type) )
7145             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
7146         else {
7147             for( i = 0; i < 8; i++ ) {
7148                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
7149
7150                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
7151                     bS[i] = 4;
7152                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
7153                          /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
7154                          h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
7155                     bS[i] = 2;
7156                 else
7157                     bS[i] = 1;
7158             }
7159         }
7160
7161         mb_qp = s->current_picture.qscale_table[mb_xy];
7162         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
7163         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
7164         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
7165         chroma_qp[0] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, mb_qp ) +
7166                          get_chroma_qp( h->pps.chroma_qp_index_offset, mbn0_qp ) + 1 ) >> 1;
7167         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
7168         chroma_qp[1] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, mb_qp ) +
7169                          get_chroma_qp( h->pps.chroma_qp_index_offset, mbn1_qp ) + 1 ) >> 1;
7170
7171         /* Filter edge */
7172         tprintf("filter mb:%d/%d MBAFF, QPy:%d/%d, QPc:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], chroma_qp[0], chroma_qp[1], linesize, uvlinesize);
7173         { int i; for (i = 0; i < 8; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
7174         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
7175         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, chroma_qp );
7176         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, chroma_qp );
7177     }
7178     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
7179     for( dir = 0; dir < 2; dir++ )
7180     {
7181         int edge;
7182         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
7183         const int mbm_type = s->current_picture.mb_type[mbm_xy];
7184         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
7185
7186         const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
7187                                   == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
7188         // how often to recheck mv-based bS when iterating between edges
7189         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
7190                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
7191         // how often to recheck mv-based bS when iterating along each edge
7192         const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
7193
7194         if (first_vertical_edge_done) {
7195             start = 1;
7196             first_vertical_edge_done = 0;
7197         }
7198
7199         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
7200             start = 1;
7201
7202         if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
7203             && !IS_INTERLACED(mb_type)
7204             && IS_INTERLACED(mbm_type)
7205             ) {
7206             // This is a special case in the norm where the filtering must
7207             // be done twice (one each of the field) even if we are in a
7208             // frame macroblock.
7209             //
7210             static const int nnz_idx[4] = {4,5,6,3};
7211             unsigned int tmp_linesize   = 2 *   linesize;
7212             unsigned int tmp_uvlinesize = 2 * uvlinesize;
7213             int mbn_xy = mb_xy - 2 * s->mb_stride;
7214             int qp, chroma_qp;
7215             int i, j;
7216             int16_t bS[4];
7217
7218             for(j=0; j<2; j++, mbn_xy += s->mb_stride){
7219                 if( IS_INTRA(mb_type) ||
7220                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
7221                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
7222                 } else {
7223                     const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
7224                     for( i = 0; i < 4; i++ ) {
7225                         if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
7226                             mbn_nnz[nnz_idx[i]] != 0 )
7227                             bS[i] = 2;
7228                         else
7229                             bS[i] = 1;
7230                     }
7231                 }
7232                 // Do not use s->qscale as luma quantizer because it has not the same
7233                 // value in IPCM macroblocks.
7234                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
7235                 tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
7236                 { int i; for (i = 0; i < 4; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
7237                 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
7238                 chroma_qp = ( h->chroma_qp +
7239                               get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
7240                 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp );
7241                 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp );
7242             }
7243
7244             start = 1;
7245         }
7246
7247         /* Calculate bS */
7248         for( edge = start; edge < edges; edge++ ) {
7249             /* mbn_xy: neighbor macroblock */
7250             const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
7251             const int mbn_type = s->current_picture.mb_type[mbn_xy];
7252             int16_t bS[4];
7253             int qp;
7254
7255             if( (edge&1) && IS_8x8DCT(mb_type) )
7256                 continue;
7257
7258             if( IS_INTRA(mb_type) ||
7259                 IS_INTRA(mbn_type) ) {
7260                 int value;
7261                 if (edge == 0) {
7262                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
7263                         || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
7264                     ) {
7265                         value = 4;
7266                     } else {
7267                         value = 3;
7268                     }
7269                 } else {
7270                     value = 3;
7271                 }
7272                 bS[0] = bS[1] = bS[2] = bS[3] = value;
7273             } else {
7274                 int i, l;
7275                 int mv_done;
7276
7277                 if( edge & mask_edge ) {
7278                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
7279                     mv_done = 1;
7280                 }
7281                 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
7282                     bS[0] = bS[1] = bS[2] = bS[3] = 1;
7283                     mv_done = 1;
7284                 }
7285                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
7286                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
7287                     int bn_idx= b_idx - (dir ? 8:1);
7288                     int v = 0;
7289                     for( l = 0; !v && l < 1 + (h->slice_type == B_TYPE); l++ ) {
7290                         v |= ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
7291                              FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
7292                              FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
7293                     }
7294                     bS[0] = bS[1] = bS[2] = bS[3] = v;
7295                     mv_done = 1;
7296                 }
7297                 else
7298                     mv_done = 0;
7299
7300                 for( i = 0; i < 4; i++ ) {
7301                     int x = dir == 0 ? edge : i;
7302                     int y = dir == 0 ? i    : edge;
7303                     int b_idx= 8 + 4 + x + 8*y;
7304                     int bn_idx= b_idx - (dir ? 8:1);
7305
7306                     if( h->non_zero_count_cache[b_idx] != 0 ||
7307                         h->non_zero_count_cache[bn_idx] != 0 ) {
7308                         bS[i] = 2;
7309                     }
7310                     else if(!mv_done)
7311                     {
7312                         bS[i] = 0;
7313                         for( l = 0; l < 1 + (h->slice_type == B_TYPE); l++ ) {
7314                             if( ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
7315                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
7316                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
7317                                 bS[i] = 1;
7318                                 break;
7319                             }
7320                         }
7321                     }
7322                 }
7323
7324                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
7325                     continue;
7326             }
7327
7328             /* Filter edge */
7329             // Do not use s->qscale as luma quantizer because it has not the same
7330             // value in IPCM macroblocks.
7331             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
7332             //tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
7333             tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
7334             { int i; for (i = 0; i < 4; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
7335             if( dir == 0 ) {
7336                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
7337                 if( (edge&1) == 0 ) {
7338                     int chroma_qp = ( h->chroma_qp +
7339                                       get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
7340                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS, chroma_qp );
7341                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS, chroma_qp );
7342                 }
7343             } else {
7344                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
7345                 if( (edge&1) == 0 ) {
7346                     int chroma_qp = ( h->chroma_qp +
7347                                       get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
7348                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
7349                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
7350                 }
7351             }
7352         }
7353     }
7354 }
7355
7356 static int decode_slice(H264Context *h){
7357     MpegEncContext * const s = &h->s;
7358     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
7359
7360     s->mb_skip_run= -1;
7361
7362     if( h->pps.cabac ) {
7363         int i;
7364
7365         /* realign */
7366         align_get_bits( &s->gb );
7367
7368         /* init cabac */
7369         ff_init_cabac_states( &h->cabac);
7370         ff_init_cabac_decoder( &h->cabac,
7371                                s->gb.buffer + get_bits_count(&s->gb)/8,
7372                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
7373         /* calculate pre-state */
7374         for( i= 0; i < 460; i++ ) {
7375             int pre;
7376             if( h->slice_type == I_TYPE )
7377                 pre = clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
7378             else
7379                 pre = clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
7380
7381             if( pre <= 63 )
7382                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
7383             else
7384                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
7385         }
7386
7387         for(;;){
7388 //START_TIMER
7389             int ret = decode_mb_cabac(h);
7390             int eos;
7391 //STOP_TIMER("decode_mb_cabac")
7392
7393             if(ret>=0) hl_decode_mb(h);
7394
7395             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
7396                 s->mb_y++;
7397
7398                 if(ret>=0) ret = decode_mb_cabac(h);
7399
7400                 if(ret>=0) hl_decode_mb(h);
7401                 s->mb_y--;
7402             }
7403             eos = get_cabac_terminate( &h->cabac );
7404
7405             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
7406                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%d)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
7407                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7408                 return -1;
7409             }
7410
7411             if( ++s->mb_x >= s->mb_width ) {
7412                 s->mb_x = 0;
7413                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
7414                 ++s->mb_y;
7415                 if(FRAME_MBAFF) {
7416                     ++s->mb_y;
7417                 }
7418             }
7419
7420             if( eos || s->mb_y >= s->mb_height ) {
7421                 tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7422                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7423                 return 0;
7424             }
7425         }
7426
7427     } else {
7428         for(;;){
7429             int ret = decode_mb_cavlc(h);
7430
7431             if(ret>=0) hl_decode_mb(h);
7432
7433             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
7434                 s->mb_y++;
7435                 ret = decode_mb_cavlc(h);
7436
7437                 if(ret>=0) hl_decode_mb(h);
7438                 s->mb_y--;
7439             }
7440
7441             if(ret<0){
7442                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
7443                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7444
7445                 return -1;
7446             }
7447
7448             if(++s->mb_x >= s->mb_width){
7449                 s->mb_x=0;
7450                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
7451                 ++s->mb_y;
7452                 if(FRAME_MBAFF) {
7453                     ++s->mb_y;
7454                 }
7455                 if(s->mb_y >= s->mb_height){
7456                     tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7457
7458                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
7459                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7460
7461                         return 0;
7462                     }else{
7463                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7464
7465                         return -1;
7466                     }
7467                 }
7468             }
7469
7470             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
7471                 tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7472                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
7473                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7474
7475                     return 0;
7476                 }else{
7477                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7478
7479                     return -1;
7480                 }
7481             }
7482         }
7483     }
7484
7485 #if 0
7486     for(;s->mb_y < s->mb_height; s->mb_y++){
7487         for(;s->mb_x < s->mb_width; s->mb_x++){
7488             int ret= decode_mb(h);
7489
7490             hl_decode_mb(h);
7491
7492             if(ret<0){
7493                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
7494                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7495
7496                 return -1;
7497             }
7498
7499             if(++s->mb_x >= s->mb_width){
7500                 s->mb_x=0;
7501                 if(++s->mb_y >= s->mb_height){
7502                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
7503                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7504
7505                         return 0;
7506                     }else{
7507                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7508
7509                         return -1;
7510                     }
7511                 }
7512             }
7513
7514             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
7515                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
7516                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7517
7518                     return 0;
7519                 }else{
7520                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7521
7522                     return -1;
7523                 }
7524             }
7525         }
7526         s->mb_x=0;
7527         ff_draw_horiz_band(s, 16*s->mb_y, 16);
7528     }
7529 #endif
7530     return -1; //not reached
7531 }
7532
7533 static int decode_unregistered_user_data(H264Context *h, int size){
7534     MpegEncContext * const s = &h->s;
7535     uint8_t user_data[16+256];
7536     int e, build, i;
7537
7538     if(size<16)
7539         return -1;
7540
7541     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
7542         user_data[i]= get_bits(&s->gb, 8);
7543     }
7544
7545     user_data[i]= 0;
7546     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
7547     if(e==1 && build>=0)
7548         h->x264_build= build;
7549
7550     if(s->avctx->debug & FF_DEBUG_BUGS)
7551         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
7552
7553     for(; i<size; i++)
7554         skip_bits(&s->gb, 8);
7555
7556     return 0;
7557 }
7558
7559 static int decode_sei(H264Context *h){
7560     MpegEncContext * const s = &h->s;
7561
7562     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
7563         int size, type;
7564
7565         type=0;
7566         do{
7567             type+= show_bits(&s->gb, 8);
7568         }while(get_bits(&s->gb, 8) == 255);
7569
7570         size=0;
7571         do{
7572             size+= show_bits(&s->gb, 8);
7573         }while(get_bits(&s->gb, 8) == 255);
7574
7575         switch(type){
7576         case 5:
7577             if(decode_unregistered_user_data(h, size) < 0)
7578                 return -1;
7579             break;
7580         default:
7581             skip_bits(&s->gb, 8*size);
7582         }
7583
7584         //FIXME check bits here
7585         align_get_bits(&s->gb);
7586     }
7587
7588     return 0;
7589 }
7590
7591 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
7592     MpegEncContext * const s = &h->s;
7593     int cpb_count, i;
7594     cpb_count = get_ue_golomb(&s->gb) + 1;
7595     get_bits(&s->gb, 4); /* bit_rate_scale */
7596     get_bits(&s->gb, 4); /* cpb_size_scale */
7597     for(i=0; i<cpb_count; i++){
7598         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
7599         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
7600         get_bits1(&s->gb);     /* cbr_flag */
7601     }
7602     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
7603     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
7604     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
7605     get_bits(&s->gb, 5); /* time_offset_length */
7606 }
7607
7608 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
7609     MpegEncContext * const s = &h->s;
7610     int aspect_ratio_info_present_flag;
7611     unsigned int aspect_ratio_idc;
7612     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
7613
7614     aspect_ratio_info_present_flag= get_bits1(&s->gb);
7615
7616     if( aspect_ratio_info_present_flag ) {
7617         aspect_ratio_idc= get_bits(&s->gb, 8);
7618         if( aspect_ratio_idc == EXTENDED_SAR ) {
7619             sps->sar.num= get_bits(&s->gb, 16);
7620             sps->sar.den= get_bits(&s->gb, 16);
7621         }else if(aspect_ratio_idc < 14){
7622             sps->sar=  pixel_aspect[aspect_ratio_idc];
7623         }else{
7624             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
7625             return -1;
7626         }
7627     }else{
7628         sps->sar.num=
7629         sps->sar.den= 0;
7630     }
7631 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
7632
7633     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
7634         get_bits1(&s->gb);      /* overscan_appropriate_flag */
7635     }
7636
7637     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
7638         get_bits(&s->gb, 3);    /* video_format */
7639         get_bits1(&s->gb);      /* video_full_range_flag */
7640         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
7641             get_bits(&s->gb, 8); /* colour_primaries */
7642             get_bits(&s->gb, 8); /* transfer_characteristics */
7643             get_bits(&s->gb, 8); /* matrix_coefficients */
7644         }
7645     }
7646
7647     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
7648         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
7649         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
7650     }
7651
7652     sps->timing_info_present_flag = get_bits1(&s->gb);
7653     if(sps->timing_info_present_flag){
7654         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7655         sps->time_scale = get_bits_long(&s->gb, 32);
7656         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7657     }
7658
7659     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7660     if(nal_hrd_parameters_present_flag)
7661         decode_hrd_parameters(h, sps);
7662     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7663     if(vcl_hrd_parameters_present_flag)
7664         decode_hrd_parameters(h, sps);
7665     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
7666         get_bits1(&s->gb);     /* low_delay_hrd_flag */
7667     get_bits1(&s->gb);         /* pic_struct_present_flag */
7668
7669     sps->bitstream_restriction_flag = get_bits1(&s->gb);
7670     if(sps->bitstream_restriction_flag){
7671         unsigned int num_reorder_frames;
7672         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
7673         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7674         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7675         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7676         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7677         num_reorder_frames= get_ue_golomb(&s->gb);
7678         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7679
7680         if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7681             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", num_reorder_frames);
7682             return -1;
7683         }
7684
7685         sps->num_reorder_frames= num_reorder_frames;
7686     }
7687
7688     return 0;
7689 }
7690
7691 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7692                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7693     MpegEncContext * const s = &h->s;
7694     int i, last = 8, next = 8;
7695     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
7696     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7697         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7698     else
7699     for(i=0;i<size;i++){
7700         if(next)
7701             next = (last + get_se_golomb(&s->gb)) & 0xff;
7702         if(!i && !next){ /* matrix not written, we use the preset one */
7703             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7704             break;
7705         }
7706         last = factors[scan[i]] = next ? next : last;
7707     }
7708 }
7709
7710 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7711                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7712     MpegEncContext * const s = &h->s;
7713     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7714     const uint8_t *fallback[4] = {
7715         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7716         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7717         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7718         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7719     };
7720     if(get_bits1(&s->gb)){
7721         sps->scaling_matrix_present |= is_sps;
7722         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7723         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7724         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7725         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7726         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7727         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7728         if(is_sps || pps->transform_8x8_mode){
7729             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7730             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7731         }
7732     } else if(fallback_sps) {
7733         memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
7734         memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
7735     }
7736 }
7737
7738 static inline int decode_seq_parameter_set(H264Context *h){
7739     MpegEncContext * const s = &h->s;
7740     int profile_idc, level_idc;
7741     unsigned int sps_id, tmp, mb_width, mb_height;
7742     int i;
7743     SPS *sps;
7744
7745     profile_idc= get_bits(&s->gb, 8);
7746     get_bits1(&s->gb);   //constraint_set0_flag
7747     get_bits1(&s->gb);   //constraint_set1_flag
7748     get_bits1(&s->gb);   //constraint_set2_flag
7749     get_bits1(&s->gb);   //constraint_set3_flag
7750     get_bits(&s->gb, 4); // reserved
7751     level_idc= get_bits(&s->gb, 8);
7752     sps_id= get_ue_golomb(&s->gb);
7753
7754     if (sps_id >= MAX_SPS_COUNT){
7755         // ok it has gone out of hand, someone is sending us bad stuff.
7756         av_log(h->s.avctx, AV_LOG_ERROR, "illegal sps_id (%d)\n", sps_id);
7757         return -1;
7758     }
7759
7760     sps= &h->sps_buffer[ sps_id ];
7761     sps->profile_idc= profile_idc;
7762     sps->level_idc= level_idc;
7763
7764     if(sps->profile_idc >= 100){ //high profile
7765         if(get_ue_golomb(&s->gb) == 3) //chroma_format_idc
7766             get_bits1(&s->gb);  //residual_color_transform_flag
7767         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7768         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7769         sps->transform_bypass = get_bits1(&s->gb);
7770         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7771     }else
7772         sps->scaling_matrix_present = 0;
7773
7774     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7775     sps->poc_type= get_ue_golomb(&s->gb);
7776
7777     if(sps->poc_type == 0){ //FIXME #define
7778         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7779     } else if(sps->poc_type == 1){//FIXME #define
7780         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7781         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7782         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7783         tmp= get_ue_golomb(&s->gb);
7784
7785         if(tmp >= sizeof(sps->offset_for_ref_frame) / sizeof(sps->offset_for_ref_frame[0])){
7786             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
7787             return -1;
7788         }
7789         sps->poc_cycle_length= tmp;
7790
7791         for(i=0; i<sps->poc_cycle_length; i++)
7792             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7793     }else if(sps->poc_type != 2){
7794         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7795         return -1;
7796     }
7797
7798     tmp= get_ue_golomb(&s->gb);
7799     if(tmp > MAX_PICTURE_COUNT-2){
7800         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7801     }
7802     sps->ref_frame_count= tmp;
7803     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7804     mb_width= get_ue_golomb(&s->gb) + 1;
7805     mb_height= get_ue_golomb(&s->gb) + 1;
7806     if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
7807        avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
7808         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7809         return -1;
7810     }
7811     sps->mb_width = mb_width;
7812     sps->mb_height= mb_height;
7813
7814     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7815     if(!sps->frame_mbs_only_flag)
7816         sps->mb_aff= get_bits1(&s->gb);
7817     else
7818         sps->mb_aff= 0;
7819
7820     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7821
7822 #ifndef ALLOW_INTERLACE
7823     if(sps->mb_aff)
7824         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7825 #endif
7826     if(!sps->direct_8x8_inference_flag && sps->mb_aff)
7827         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
7828
7829     sps->crop= get_bits1(&s->gb);
7830     if(sps->crop){
7831         sps->crop_left  = get_ue_golomb(&s->gb);
7832         sps->crop_right = get_ue_golomb(&s->gb);
7833         sps->crop_top   = get_ue_golomb(&s->gb);
7834         sps->crop_bottom= get_ue_golomb(&s->gb);
7835         if(sps->crop_left || sps->crop_top){
7836             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7837         }
7838     }else{
7839         sps->crop_left  =
7840         sps->crop_right =
7841         sps->crop_top   =
7842         sps->crop_bottom= 0;
7843     }
7844
7845     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7846     if( sps->vui_parameters_present_flag )
7847         decode_vui_parameters(h, sps);
7848
7849     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7850         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
7851                sps_id, sps->profile_idc, sps->level_idc,
7852                sps->poc_type,
7853                sps->ref_frame_count,
7854                sps->mb_width, sps->mb_height,
7855                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7856                sps->direct_8x8_inference_flag ? "8B8" : "",
7857                sps->crop_left, sps->crop_right,
7858                sps->crop_top, sps->crop_bottom,
7859                sps->vui_parameters_present_flag ? "VUI" : ""
7860                );
7861     }
7862     return 0;
7863 }
7864
7865 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7866     MpegEncContext * const s = &h->s;
7867     unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
7868     PPS *pps;
7869
7870     if(pps_id>=MAX_PPS_COUNT){
7871         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
7872         return -1;
7873     }
7874     pps = &h->pps_buffer[pps_id];
7875
7876     tmp= get_ue_golomb(&s->gb);
7877     if(tmp>=MAX_SPS_COUNT){
7878         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7879         return -1;
7880     }
7881     pps->sps_id= tmp;
7882
7883     pps->cabac= get_bits1(&s->gb);
7884     pps->pic_order_present= get_bits1(&s->gb);
7885     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7886     if(pps->slice_group_count > 1 ){
7887         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7888         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7889         switch(pps->mb_slice_group_map_type){
7890         case 0:
7891 #if 0
7892 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7893 |    run_length[ i ]                                |1  |ue(v)   |
7894 #endif
7895             break;
7896         case 2:
7897 #if 0
7898 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7899 |{                                                  |   |        |
7900 |    top_left_mb[ i ]                               |1  |ue(v)   |
7901 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7902 |   }                                               |   |        |
7903 #endif
7904             break;
7905         case 3:
7906         case 4:
7907         case 5:
7908 #if 0
7909 |   slice_group_change_direction_flag               |1  |u(1)    |
7910 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7911 #endif
7912             break;
7913         case 6:
7914 #if 0
7915 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7916 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7917 |)                                                  |   |        |
7918 |    slice_group_id[ i ]                            |1  |u(v)    |
7919 #endif
7920             break;
7921         }
7922     }
7923     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7924     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7925     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7926         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7927         pps->ref_count[0]= pps->ref_count[1]= 1;
7928         return -1;
7929     }
7930
7931     pps->weighted_pred= get_bits1(&s->gb);
7932     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7933     pps->init_qp= get_se_golomb(&s->gb) + 26;
7934     pps->init_qs= get_se_golomb(&s->gb) + 26;
7935     pps->chroma_qp_index_offset= get_se_golomb(&s->gb);
7936     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7937     pps->constrained_intra_pred= get_bits1(&s->gb);
7938     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7939
7940     pps->transform_8x8_mode= 0;
7941     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7942     memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
7943     memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
7944
7945     if(get_bits_count(&s->gb) < bit_length){
7946         pps->transform_8x8_mode= get_bits1(&s->gb);
7947         decode_scaling_matrices(h, &h->sps_buffer[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7948         get_se_golomb(&s->gb);  //second_chroma_qp_index_offset
7949     }
7950
7951     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7952         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d %s %s %s %s\n",
7953                pps_id, pps->sps_id,
7954                pps->cabac ? "CABAC" : "CAVLC",
7955                pps->slice_group_count,
7956                pps->ref_count[0], pps->ref_count[1],
7957                pps->weighted_pred ? "weighted" : "",
7958                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset,
7959                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7960                pps->constrained_intra_pred ? "CONSTR" : "",
7961                pps->redundant_pic_cnt_present ? "REDU" : "",
7962                pps->transform_8x8_mode ? "8x8DCT" : ""
7963                );
7964     }
7965
7966     return 0;
7967 }
7968
7969 /**
7970  * finds the end of the current frame in the bitstream.
7971  * @return the position of the first byte of the next frame, or -1
7972  */
7973 static int find_frame_end(H264Context *h, const uint8_t *buf, int buf_size){
7974     int i;
7975     uint32_t state;
7976     ParseContext *pc = &(h->s.parse_context);
7977 //printf("first %02X%02X%02X%02X\n", buf[0], buf[1],buf[2],buf[3]);
7978 //    mb_addr= pc->mb_addr - 1;
7979     state= pc->state;
7980     for(i=0; i<=buf_size; i++){
7981         if((state&0xFFFFFF1F) == 0x101 || (state&0xFFFFFF1F) == 0x102 || (state&0xFFFFFF1F) == 0x105){
7982             tprintf("find_frame_end new startcode = %08x, frame_start_found = %d, pos = %d\n", state, pc->frame_start_found, i);
7983             if(pc->frame_start_found){
7984                 // If there isn't one more byte in the buffer
7985                 // the test on first_mb_in_slice cannot be done yet
7986                 // do it at next call.
7987                 if (i >= buf_size) break;
7988                 if (buf[i] & 0x80) {
7989                     // first_mb_in_slice is 0, probably the first nal of a new
7990                     // slice
7991                     tprintf("find_frame_end frame_end_found, state = %08x, pos = %d\n", state, i);
7992                     pc->state=-1;
7993                     pc->frame_start_found= 0;
7994                     return i-4;
7995                 }
7996             }
7997             pc->frame_start_found = 1;
7998         }
7999         if((state&0xFFFFFF1F) == 0x107 || (state&0xFFFFFF1F) == 0x108 || (state&0xFFFFFF1F) == 0x109){
8000            if(pc->frame_start_found){
8001                 pc->state=-1;
8002                 pc->frame_start_found= 0;
8003                 return i-4;
8004            }
8005         }
8006         if (i<buf_size)
8007             state= (state<<8) | buf[i];
8008     }
8009
8010     pc->state= state;
8011     return END_NOT_FOUND;
8012 }
8013
8014 #ifdef CONFIG_H264_PARSER
8015 static int h264_parse(AVCodecParserContext *s,
8016                       AVCodecContext *avctx,
8017                       uint8_t **poutbuf, int *poutbuf_size,
8018                       const uint8_t *buf, int buf_size)
8019 {
8020     H264Context *h = s->priv_data;
8021     ParseContext *pc = &h->s.parse_context;
8022     int next;
8023
8024     next= find_frame_end(h, buf, buf_size);
8025
8026     if (ff_combine_frame(pc, next, (uint8_t **)&buf, &buf_size) < 0) {
8027         *poutbuf = NULL;
8028         *poutbuf_size = 0;
8029         return buf_size;
8030     }
8031
8032     *poutbuf = (uint8_t *)buf;
8033     *poutbuf_size = buf_size;
8034     return next;
8035 }
8036
8037 static int h264_split(AVCodecContext *avctx,
8038                       const uint8_t *buf, int buf_size)
8039 {
8040     int i;
8041     uint32_t state = -1;
8042     int has_sps= 0;
8043
8044     for(i=0; i<=buf_size; i++){
8045         if((state&0xFFFFFF1F) == 0x107)
8046             has_sps=1;
8047 /*        if((state&0xFFFFFF1F) == 0x101 || (state&0xFFFFFF1F) == 0x102 || (state&0xFFFFFF1F) == 0x105){
8048         }*/
8049         if((state&0xFFFFFF00) == 0x100 && (state&0xFFFFFF1F) != 0x107 && (state&0xFFFFFF1F) != 0x108 && (state&0xFFFFFF1F) != 0x109){
8050             if(has_sps){
8051                 while(i>4 && buf[i-5]==0) i--;
8052                 return i-4;
8053             }
8054         }
8055         if (i<buf_size)
8056             state= (state<<8) | buf[i];
8057     }
8058     return 0;
8059 }
8060 #endif /* CONFIG_H264_PARSER */
8061
8062 static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
8063     MpegEncContext * const s = &h->s;
8064     AVCodecContext * const avctx= s->avctx;
8065     int buf_index=0;
8066 #if 0
8067     int i;
8068     for(i=0; i<50; i++){
8069         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
8070     }
8071 #endif
8072     h->slice_num = 0;
8073     s->current_picture_ptr= NULL;
8074     for(;;){
8075         int consumed;
8076         int dst_length;
8077         int bit_length;
8078         uint8_t *ptr;
8079         int i, nalsize = 0;
8080
8081       if(h->is_avc) {
8082         if(buf_index >= buf_size) break;
8083         nalsize = 0;
8084         for(i = 0; i < h->nal_length_size; i++)
8085             nalsize = (nalsize << 8) | buf[buf_index++];
8086         if(nalsize <= 1 || nalsize > buf_size){
8087             if(nalsize == 1){
8088                 buf_index++;
8089                 continue;
8090             }else{
8091                 av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
8092                 break;
8093             }
8094         }
8095       } else {
8096         // start code prefix search
8097         for(; buf_index + 3 < buf_size; buf_index++){
8098             // this should allways succeed in the first iteration
8099             if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
8100                 break;
8101         }
8102
8103         if(buf_index+3 >= buf_size) break;
8104
8105         buf_index+=3;
8106       }
8107
8108         ptr= decode_nal(h, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
8109         if (ptr==NULL || dst_length <= 0){
8110             return -1;
8111         }
8112         while(ptr[dst_length - 1] == 0 && dst_length > 1)
8113             dst_length--;
8114         bit_length= 8*dst_length - decode_rbsp_trailing(ptr + dst_length - 1);
8115
8116         if(s->avctx->debug&FF_DEBUG_STARTCODE){
8117             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", h->nal_unit_type, buf_index, buf_size, dst_length);
8118         }
8119
8120         if (h->is_avc && (nalsize != consumed))
8121             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
8122
8123         buf_index += consumed;
8124
8125         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME dont discard SEI id
8126            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
8127             continue;
8128
8129         switch(h->nal_unit_type){
8130         case NAL_IDR_SLICE:
8131             idr(h); //FIXME ensure we don't loose some frames if there is reordering
8132         case NAL_SLICE:
8133             init_get_bits(&s->gb, ptr, bit_length);
8134             h->intra_gb_ptr=
8135             h->inter_gb_ptr= &s->gb;
8136             s->data_partitioning = 0;
8137
8138             if(decode_slice_header(h) < 0){
8139                 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
8140                 break;
8141             }
8142             s->current_picture_ptr->key_frame= (h->nal_unit_type == NAL_IDR_SLICE);
8143             if(h->redundant_pic_count==0 && s->hurry_up < 5
8144                && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc)
8145                && (avctx->skip_frame < AVDISCARD_BIDIR  || h->slice_type!=B_TYPE)
8146                && (avctx->skip_frame < AVDISCARD_NONKEY || h->slice_type==I_TYPE)
8147                && avctx->skip_frame < AVDISCARD_ALL)
8148                 decode_slice(h);
8149             break;
8150         case NAL_DPA:
8151             init_get_bits(&s->gb, ptr, bit_length);
8152             h->intra_gb_ptr=
8153             h->inter_gb_ptr= NULL;
8154             s->data_partitioning = 1;
8155
8156             if(decode_slice_header(h) < 0){
8157                 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
8158             }
8159             break;
8160         case NAL_DPB:
8161             init_get_bits(&h->intra_gb, ptr, bit_length);
8162             h->intra_gb_ptr= &h->intra_gb;
8163             break;
8164         case NAL_DPC:
8165             init_get_bits(&h->inter_gb, ptr, bit_length);
8166             h->inter_gb_ptr= &h->inter_gb;
8167
8168             if(h->redundant_pic_count==0 && h->intra_gb_ptr && s->data_partitioning
8169                && s->context_initialized
8170                && s->hurry_up < 5
8171                && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc)
8172                && (avctx->skip_frame < AVDISCARD_BIDIR  || h->slice_type!=B_TYPE)
8173                && (avctx->skip_frame < AVDISCARD_NONKEY || h->slice_type==I_TYPE)
8174                && avctx->skip_frame < AVDISCARD_ALL)
8175                 decode_slice(h);
8176             break;
8177         case NAL_SEI:
8178             init_get_bits(&s->gb, ptr, bit_length);
8179             decode_sei(h);
8180             break;
8181         case NAL_SPS:
8182             init_get_bits(&s->gb, ptr, bit_length);
8183             decode_seq_parameter_set(h);
8184
8185             if(s->flags& CODEC_FLAG_LOW_DELAY)
8186                 s->low_delay=1;
8187
8188             if(avctx->has_b_frames < 2)
8189                 avctx->has_b_frames= !s->low_delay;
8190             break;
8191         case NAL_PPS:
8192             init_get_bits(&s->gb, ptr, bit_length);
8193
8194             decode_picture_parameter_set(h, bit_length);
8195
8196             break;
8197         case NAL_AUD:
8198         case NAL_END_SEQUENCE:
8199         case NAL_END_STREAM:
8200         case NAL_FILLER_DATA:
8201         case NAL_SPS_EXT:
8202         case NAL_AUXILIARY_SLICE:
8203             break;
8204         default:
8205             av_log(avctx, AV_LOG_ERROR, "Unknown NAL code: %d\n", h->nal_unit_type);
8206         }
8207     }
8208
8209     if(!s->current_picture_ptr) return buf_index; //no frame
8210
8211     s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
8212     s->current_picture_ptr->pict_type= s->pict_type;
8213
8214     h->prev_frame_num_offset= h->frame_num_offset;
8215     h->prev_frame_num= h->frame_num;
8216     if(s->current_picture_ptr->reference){
8217         h->prev_poc_msb= h->poc_msb;
8218         h->prev_poc_lsb= h->poc_lsb;
8219     }
8220     if(s->current_picture_ptr->reference)
8221         execute_ref_pic_marking(h, h->mmco, h->mmco_index);
8222
8223     ff_er_frame_end(s);
8224
8225     MPV_frame_end(s);
8226
8227     return buf_index;
8228 }
8229
8230 /**
8231  * returns the number of bytes consumed for building the current frame
8232  */
8233 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
8234     if(s->flags&CODEC_FLAG_TRUNCATED){
8235         pos -= s->parse_context.last_index;
8236         if(pos<0) pos=0; // FIXME remove (unneeded?)
8237
8238         return pos;
8239     }else{
8240         if(pos==0) pos=1; //avoid infinite loops (i doubt thats needed but ...)
8241         if(pos+10>buf_size) pos=buf_size; // oops ;)
8242
8243         return pos;
8244     }
8245 }
8246
8247 static int decode_frame(AVCodecContext *avctx,
8248                              void *data, int *data_size,
8249                              uint8_t *buf, int buf_size)
8250 {
8251     H264Context *h = avctx->priv_data;
8252     MpegEncContext *s = &h->s;
8253     AVFrame *pict = data;
8254     int buf_index;
8255
8256     s->flags= avctx->flags;
8257     s->flags2= avctx->flags2;
8258
8259    /* no supplementary picture */
8260     if (buf_size == 0) {
8261         return 0;
8262     }
8263
8264     if(s->flags&CODEC_FLAG_TRUNCATED){
8265         int next= find_frame_end(h, buf, buf_size);
8266
8267         if( ff_combine_frame(&s->parse_context, next, &buf, &buf_size) < 0 )
8268             return buf_size;
8269 //printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
8270     }
8271
8272     if(h->is_avc && !h->got_avcC) {
8273         int i, cnt, nalsize;
8274         unsigned char *p = avctx->extradata;
8275         if(avctx->extradata_size < 7) {
8276             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
8277             return -1;
8278         }
8279         if(*p != 1) {
8280             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
8281             return -1;
8282         }
8283         /* sps and pps in the avcC always have length coded with 2 bytes,
8284            so put a fake nal_length_size = 2 while parsing them */
8285         h->nal_length_size = 2;
8286         // Decode sps from avcC
8287         cnt = *(p+5) & 0x1f; // Number of sps
8288         p += 6;
8289         for (i = 0; i < cnt; i++) {
8290             nalsize = AV_RB16(p) + 2;
8291             if(decode_nal_units(h, p, nalsize) < 0) {
8292                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
8293                 return -1;
8294             }
8295             p += nalsize;
8296         }
8297         // Decode pps from avcC
8298         cnt = *(p++); // Number of pps
8299         for (i = 0; i < cnt; i++) {
8300             nalsize = AV_RB16(p) + 2;
8301             if(decode_nal_units(h, p, nalsize)  != nalsize) {
8302                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
8303                 return -1;
8304             }
8305             p += nalsize;
8306         }
8307         // Now store right nal length size, that will be use to parse all other nals
8308         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
8309         // Do not reparse avcC
8310         h->got_avcC = 1;
8311     }
8312
8313     if(avctx->frame_number==0 && !h->is_avc && s->avctx->extradata_size){
8314         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
8315             return -1;
8316     }
8317
8318     buf_index=decode_nal_units(h, buf, buf_size);
8319     if(buf_index < 0)
8320         return -1;
8321
8322     //FIXME do something with unavailable reference frames
8323
8324 //    if(ret==FRAME_SKIPPED) return get_consumed_bytes(s, buf_index, buf_size);
8325     if(!s->current_picture_ptr){
8326         av_log(h->s.avctx, AV_LOG_DEBUG, "error, NO frame\n");
8327         return -1;
8328     }
8329
8330     {
8331         Picture *out = s->current_picture_ptr;
8332 #if 0 //decode order
8333         *data_size = sizeof(AVFrame);
8334 #else
8335         /* Sort B-frames into display order */
8336         Picture *cur = s->current_picture_ptr;
8337         Picture *prev = h->delayed_output_pic;
8338         int i, pics, cross_idr, out_of_order, out_idx;
8339
8340         if(h->sps.bitstream_restriction_flag
8341            && s->avctx->has_b_frames < h->sps.num_reorder_frames){
8342             s->avctx->has_b_frames = h->sps.num_reorder_frames;
8343             s->low_delay = 0;
8344         }
8345
8346         pics = 0;
8347         while(h->delayed_pic[pics]) pics++;
8348
8349         assert(pics+1 < sizeof(h->delayed_pic) / sizeof(h->delayed_pic[0]));
8350
8351         h->delayed_pic[pics++] = cur;
8352         if(cur->reference == 0)
8353             cur->reference = 1;
8354
8355         cross_idr = 0;
8356         for(i=0; h->delayed_pic[i]; i++)
8357             if(h->delayed_pic[i]->key_frame || h->delayed_pic[i]->poc==0)
8358                 cross_idr = 1;
8359
8360         out = h->delayed_pic[0];
8361         out_idx = 0;
8362         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
8363             if(h->delayed_pic[i]->poc < out->poc){
8364                 out = h->delayed_pic[i];
8365                 out_idx = i;
8366             }
8367
8368         out_of_order = !cross_idr && prev && out->poc < prev->poc;
8369         if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
8370             { }
8371         else if(prev && pics <= s->avctx->has_b_frames)
8372             out = prev;
8373         else if((out_of_order && pics-1 == s->avctx->has_b_frames && pics < 15)
8374            || (s->low_delay &&
8375             ((!cross_idr && prev && out->poc > prev->poc + 2)
8376              || cur->pict_type == B_TYPE)))
8377         {
8378             s->low_delay = 0;
8379             s->avctx->has_b_frames++;
8380             out = prev;
8381         }
8382         else if(out_of_order)
8383             out = prev;
8384
8385         if(out_of_order || pics > s->avctx->has_b_frames){
8386             for(i=out_idx; h->delayed_pic[i]; i++)
8387                 h->delayed_pic[i] = h->delayed_pic[i+1];
8388         }
8389
8390         if(prev == out)
8391             *data_size = 0;
8392         else
8393             *data_size = sizeof(AVFrame);
8394         if(prev && prev != out && prev->reference == 1)
8395             prev->reference = 0;
8396         h->delayed_output_pic = out;
8397 #endif
8398
8399         if(out)
8400             *pict= *(AVFrame*)out;
8401         else
8402             av_log(avctx, AV_LOG_DEBUG, "no picture\n");
8403     }
8404
8405     assert(pict->data[0] || !*data_size);
8406     ff_print_debug_info(s, pict);
8407 //printf("out %d\n", (int)pict->data[0]);
8408 #if 0 //?
8409
8410     /* Return the Picture timestamp as the frame number */
8411     /* we substract 1 because it is added on utils.c    */
8412     avctx->frame_number = s->picture_number - 1;
8413 #endif
8414     return get_consumed_bytes(s, buf_index, buf_size);
8415 }
8416 #if 0
8417 static inline void fill_mb_avail(H264Context *h){
8418     MpegEncContext * const s = &h->s;
8419     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
8420
8421     if(s->mb_y){
8422         h->mb_avail[0]= s->mb_x                 && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
8423         h->mb_avail[1]=                            h->slice_table[mb_xy - s->mb_stride    ] == h->slice_num;
8424         h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
8425     }else{
8426         h->mb_avail[0]=
8427         h->mb_avail[1]=
8428         h->mb_avail[2]= 0;
8429     }
8430     h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
8431     h->mb_avail[4]= 1; //FIXME move out
8432     h->mb_avail[5]= 0; //FIXME move out
8433 }
8434 #endif
8435
8436 #if 0 //selftest
8437 #define COUNT 8000
8438 #define SIZE (COUNT*40)
8439 int main(){
8440     int i;
8441     uint8_t temp[SIZE];
8442     PutBitContext pb;
8443     GetBitContext gb;
8444 //    int int_temp[10000];
8445     DSPContext dsp;
8446     AVCodecContext avctx;
8447
8448     dsputil_init(&dsp, &avctx);
8449
8450     init_put_bits(&pb, temp, SIZE);
8451     printf("testing unsigned exp golomb\n");
8452     for(i=0; i<COUNT; i++){
8453         START_TIMER
8454         set_ue_golomb(&pb, i);
8455         STOP_TIMER("set_ue_golomb");
8456     }
8457     flush_put_bits(&pb);
8458
8459     init_get_bits(&gb, temp, 8*SIZE);
8460     for(i=0; i<COUNT; i++){
8461         int j, s;
8462
8463         s= show_bits(&gb, 24);
8464
8465         START_TIMER
8466         j= get_ue_golomb(&gb);
8467         if(j != i){
8468             printf("missmatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
8469 //            return -1;
8470         }
8471         STOP_TIMER("get_ue_golomb");
8472     }
8473
8474
8475     init_put_bits(&pb, temp, SIZE);
8476     printf("testing signed exp golomb\n");
8477     for(i=0; i<COUNT; i++){
8478         START_TIMER
8479         set_se_golomb(&pb, i - COUNT/2);
8480         STOP_TIMER("set_se_golomb");
8481     }
8482     flush_put_bits(&pb);
8483
8484     init_get_bits(&gb, temp, 8*SIZE);
8485     for(i=0; i<COUNT; i++){
8486         int j, s;
8487
8488         s= show_bits(&gb, 24);
8489
8490         START_TIMER
8491         j= get_se_golomb(&gb);
8492         if(j != i - COUNT/2){
8493             printf("missmatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
8494 //            return -1;
8495         }
8496         STOP_TIMER("get_se_golomb");
8497     }
8498
8499     printf("testing 4x4 (I)DCT\n");
8500
8501     DCTELEM block[16];
8502     uint8_t src[16], ref[16];
8503     uint64_t error= 0, max_error=0;
8504
8505     for(i=0; i<COUNT; i++){
8506         int j;
8507 //        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
8508         for(j=0; j<16; j++){
8509             ref[j]= random()%255;
8510             src[j]= random()%255;
8511         }
8512
8513         h264_diff_dct_c(block, src, ref, 4);
8514
8515         //normalize
8516         for(j=0; j<16; j++){
8517 //            printf("%d ", block[j]);
8518             block[j]= block[j]*4;
8519             if(j&1) block[j]= (block[j]*4 + 2)/5;
8520             if(j&4) block[j]= (block[j]*4 + 2)/5;
8521         }
8522 //        printf("\n");
8523
8524         s->dsp.h264_idct_add(ref, block, 4);
8525 /*        for(j=0; j<16; j++){
8526             printf("%d ", ref[j]);
8527         }
8528         printf("\n");*/
8529
8530         for(j=0; j<16; j++){
8531             int diff= FFABS(src[j] - ref[j]);
8532
8533             error+= diff*diff;
8534             max_error= FFMAX(max_error, diff);
8535         }
8536     }
8537     printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
8538 #if 0
8539     printf("testing quantizer\n");
8540     for(qp=0; qp<52; qp++){
8541         for(i=0; i<16; i++)
8542             src1_block[i]= src2_block[i]= random()%255;
8543
8544     }
8545 #endif
8546     printf("Testing NAL layer\n");
8547
8548     uint8_t bitstream[COUNT];
8549     uint8_t nal[COUNT*2];
8550     H264Context h;
8551     memset(&h, 0, sizeof(H264Context));
8552
8553     for(i=0; i<COUNT; i++){
8554         int zeros= i;
8555         int nal_length;
8556         int consumed;
8557         int out_length;
8558         uint8_t *out;
8559         int j;
8560
8561         for(j=0; j<COUNT; j++){
8562             bitstream[j]= (random() % 255) + 1;
8563         }
8564
8565         for(j=0; j<zeros; j++){
8566             int pos= random() % COUNT;
8567             while(bitstream[pos] == 0){
8568                 pos++;
8569                 pos %= COUNT;
8570             }
8571             bitstream[pos]=0;
8572         }
8573
8574         START_TIMER
8575
8576         nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
8577         if(nal_length<0){
8578             printf("encoding failed\n");
8579             return -1;
8580         }
8581
8582         out= decode_nal(&h, nal, &out_length, &consumed, nal_length);
8583
8584         STOP_TIMER("NAL")
8585
8586         if(out_length != COUNT){
8587             printf("incorrect length %d %d\n", out_length, COUNT);
8588             return -1;
8589         }
8590
8591         if(consumed != nal_length){
8592             printf("incorrect consumed length %d %d\n", nal_length, consumed);
8593             return -1;
8594         }
8595
8596         if(memcmp(bitstream, out, COUNT)){
8597             printf("missmatch\n");
8598             return -1;
8599         }
8600     }
8601
8602     printf("Testing RBSP\n");
8603
8604
8605     return 0;
8606 }
8607 #endif
8608
8609
8610 static int decode_end(AVCodecContext *avctx)
8611 {
8612     H264Context *h = avctx->priv_data;
8613     MpegEncContext *s = &h->s;
8614
8615     av_freep(&h->rbsp_buffer);
8616     free_tables(h); //FIXME cleanup init stuff perhaps
8617     MPV_common_end(s);
8618
8619 //    memset(h, 0, sizeof(H264Context));
8620
8621     return 0;
8622 }
8623
8624
8625 AVCodec h264_decoder = {
8626     "h264",
8627     CODEC_TYPE_VIDEO,
8628     CODEC_ID_H264,
8629     sizeof(H264Context),
8630     decode_init,
8631     NULL,
8632     decode_end,
8633     decode_frame,
8634     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,
8635     .flush= flush_dpb,
8636 };
8637
8638 #ifdef CONFIG_H264_PARSER
8639 AVCodecParser h264_parser = {
8640     { CODEC_ID_H264 },
8641     sizeof(H264Context),
8642     NULL,
8643     h264_parse,
8644     ff_parse_close,
8645     h264_split,
8646 };
8647 #endif
8648
8649 #include "svq3.c"