git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  *
  21  */
  22
  23 /**
  24  * @file h264.c
  25  * H.264 / AVC / MPEG4 part10 codec.
  26  * @author Michael Niedermayer <michaelni@gmx.at>
  27  */
  28
  29 #include "common.h"
  30 #include "dsputil.h"
  31 #include "avcodec.h"
  32 #include "mpegvideo.h"
  33 #include "h264data.h"
  34 #include "golomb.h"
  35
  36 #include "cabac.h"
  37
  38 //#undef NDEBUG
  39 #include <assert.h>
  40
     /* Remap two identifiers to deliberately awkward names for the rest of this
      * file. NOTE(review): presumably so that accidental use of like-named
      * fields of the shared context stands out -- confirm against mpegvideo.h. */
  41 #define interlaced_dct interlaced_dct_is_a_bad_name
  42 #define mb_intra mb_intra_isnt_initalized_see_mb_type
  43
     /* NOTE(review): presumably pseudo block indices used for the luma/chroma DC
      * blocks; their users are outside the part of the file visible here. */
  44 #define LUMA_DC_BLOCK_INDEX   25
  45 #define CHROMA_DC_BLOCK_INDEX 26
  46
     /* Bit counts associated with the static VLC tables declared further down
      * (coeff_token_vlc, total_zeros_vlc, run_vlc, ...). NOTE(review): presumably
      * the lookup widths passed to the VLC init/read helpers -- confirm. */
  47 #define CHROMA_DC_COEFF_TOKEN_VLC_BITS 8
  48 #define COEFF_TOKEN_VLC_BITS           8
  49 #define TOTAL_ZEROS_VLC_BITS           9
  50 #define CHROMA_DC_TOTAL_ZEROS_VLC_BITS 3
  51 #define RUN_VLC_BITS                   3
  52 #define RUN7_VLC_BITS                  6
  53
     /* Array sizes of H264Context.sps_buffer[] and H264Context.pps_buffer[]. */
  54 #define MAX_SPS_COUNT 32
  55 #define MAX_PPS_COUNT 256
  56
     /* Array size of H264Context.mmco[]. */
  57 #define MAX_MMCO_COUNT 66
  58
  59 /* Compiling in interlaced support reduces the speed
  60  * of progressive decoding by about 2%. */
  61 #define ALLOW_INTERLACE
  62
     /* With ALLOW_INTERLACE the three macros below read per-context state and
      * therefore require an H264Context pointer named `h` in scope at the point
      * of use. Without it they collapse to the constant 0 and IS_INTERLACED() is
      * redefined to always yield 0. */
  63 #ifdef ALLOW_INTERLACE
  64 #define MB_MBAFF h->mb_mbaff
  65 #define MB_FIELD h->mb_field_decoding_flag
  66 #define FRAME_MBAFF h->mb_aff_frame
  67 #else
  68 #define MB_MBAFF 0
  69 #define MB_FIELD 0
  70 #define FRAME_MBAFF 0
  71 #undef  IS_INTERLACED
  72 #define IS_INTERLACED(mb_type) 0
  73 #endif
  74
  75 /**
  76  * Sequence parameter set
      *
      * Trailing comments name the bitstream syntax element a field is derived
      * from. H264Context holds MAX_SPS_COUNT of these in sps_buffer[] plus one
      * by-value "current" instance in sps.
  77  */
  78 typedef struct SPS{
  79
  80     int profile_idc;
  81     int level_idc;
  82     int transform_bypass;              ///< qpprime_y_zero_transform_bypass_flag
  83     int log2_max_frame_num;            ///< log2_max_frame_num_minus4 + 4
  84     int poc_type;                      ///< pic_order_cnt_type
  85     int log2_max_poc_lsb;              ///< log2_max_pic_order_cnt_lsb_minus4 (NOTE(review): log2_max_frame_num above documents a "+ 4"; confirm whether the same offset is applied here)
  86     int delta_pic_order_always_zero_flag;
  87     int offset_for_non_ref_pic;
  88     int offset_for_top_to_bottom_field;
  89     int poc_cycle_length;              ///< num_ref_frames_in_pic_order_cnt_cycle
  90     int ref_frame_count;               ///< num_ref_frames
  91     int gaps_in_frame_num_allowed_flag;
  92     int mb_width;                      ///< frame_width_in_mbs_minus1 + 1
  93     int mb_height;                     ///< frame_height_in_mbs_minus1 + 1
  94     int frame_mbs_only_flag;
  95     int mb_aff;                        ///<mb_adaptive_frame_field_flag
  96     int direct_8x8_inference_flag;
  97     int crop;                   ///< frame_cropping_flag
  98     int crop_left;              ///< frame_cropping_rect_left_offset
  99     int crop_right;             ///< frame_cropping_rect_right_offset
 100     int crop_top;               ///< frame_cropping_rect_top_offset
 101     int crop_bottom;            ///< frame_cropping_rect_bottom_offset
 102     int vui_parameters_present_flag;
 103     AVRational sar;
 104     int timing_info_present_flag;
 105     uint32_t num_units_in_tick;
 106     uint32_t time_scale;
 107     int fixed_frame_rate_flag;
 108     short offset_for_ref_frame[256]; //FIXME dyn alloc? (NOTE(review): presumably only the first poc_cycle_length entries are used -- confirm)
 109     int bitstream_restriction_flag;
 110     int num_reorder_frames;
 111     int scaling_matrix_present;
 112     uint8_t scaling_matrix4[6][16];    ///< six lists of 16 entries (one 4x4 matrix each)
 113     uint8_t scaling_matrix8[2][64];    ///< two lists of 64 entries (one 8x8 matrix each)
 114 }SPS;
 115
 116 /**
 117  * Picture parameter set
      *
      * Trailing comments name the bitstream syntax element a field is derived
      * from. H264Context holds MAX_PPS_COUNT of these in pps_buffer[] plus one
      * by-value "current" instance in pps.
 118  */
 119 typedef struct PPS{
 120     unsigned int sps_id;        ///< NOTE(review): presumably the id of the SPS this PPS refers to (index into H264Context.sps_buffer) -- confirm
 121     int cabac;                  ///< entropy_coding_mode_flag
 122     int pic_order_present;      ///< pic_order_present_flag
 123     int slice_group_count;      ///< num_slice_groups_minus1 + 1
 124     int mb_slice_group_map_type;
 125     unsigned int ref_count[2];  ///< num_ref_idx_l0/1_active_minus1 + 1
 126     int weighted_pred;          ///< weighted_pred_flag
 127     int weighted_bipred_idc;
 128     int init_qp;                ///< pic_init_qp_minus26 + 26
 129     int init_qs;                ///< pic_init_qs_minus26 + 26
 130     int chroma_qp_index_offset;
 131     int deblocking_filter_parameters_present; ///< deblocking_filter_parameters_present_flag
 132     int constrained_intra_pred; ///< constrained_intra_pred_flag
 133     int redundant_pic_cnt_present; ///< redundant_pic_cnt_present_flag
 134     int transform_8x8_mode;     ///< transform_8x8_mode_flag
 135     uint8_t scaling_matrix4[6][16]; ///< six lists of 16 entries (one 4x4 matrix each)
 136     uint8_t scaling_matrix8[2][64]; ///< two lists of 64 entries (one 8x8 matrix each)
 137 }PPS;
 138
 139 /**
 140  * Memory management control operation opcode.
      *
      * The enumerators take the consecutive values 0..6 in declaration order.
      * NOTE(review): these appear to mirror the bitstream's
      * memory_management_control_operation codes -- confirm before reordering.
 141  */
 142 typedef enum MMCOOpcode{
 143     MMCO_END=0,
 144     MMCO_SHORT2UNUSED,   // 1
 145     MMCO_LONG2UNUSED,    // 2
 146     MMCO_SHORT2LONG,     // 3
 147     MMCO_SET_MAX_LONG,   // 4
 148     MMCO_RESET,          // 5
 149     MMCO_LONG,           // 6
 150 } MMCOOpcode;
 151
 152 /**
 153  * Memory management control operation.
      *
      * One opcode plus its operands; H264Context stores up to MAX_MMCO_COUNT of
      * these in mmco[]. NOTE(review): which operand is meaningful presumably
      * depends on the opcode -- confirm in the code that fills and executes them.
 154  */
 155 typedef struct MMCO{
 156     MMCOOpcode opcode;
 157     int short_frame_num;
 158     int long_index;
 159 } MMCO;
 160
 161 /**
 162  * H264Context
      *
      * Per-decoder state. Embeds the generic MpegEncContext as its first member
      * and adds parameter sets, per-macroblock neighbour caches, reference
      * picture lists, weighted-prediction tables, CABAC state and scan tables.
 163  */
 164 typedef struct H264Context{
 165     MpegEncContext s;
 166     int nal_ref_idc;
 167     int nal_unit_type;
 168     uint8_t *rbsp_buffer;
 169     unsigned int rbsp_buffer_size;
 170
 171     /**
 172       * Used to parse AVC variant of h264
 173       */
 174     int is_avc; ///< this flag is != 0 if codec is avc1
 175     int got_avcC; ///< flag used to parse avcC data only once
 176     int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4)
 177
 178     int chroma_qp; //QPc
 179
 180     int prev_mb_skipped;
 181     int next_mb_skipped;
 182
 183     //prediction stuff
 184     int chroma_pred_mode;
 185     int intra16x16_pred_mode;
 186
 187     int top_mb_xy;       ///< written by fill_caches(): index of the macroblock used as top neighbour
 188     int left_mb_xy[2];   ///< written by fill_caches(): indices of the macroblock(s) used as left neighbour
 189
 190     int8_t intra4x4_pred_mode_cache[5*8];
 191     int8_t (*intra4x4_pred_mode)[8];
 192     void (*pred4x4  [9+3])(uint8_t *src, uint8_t *topright, int stride);//FIXME move to dsp?
 193     void (*pred8x8l [9+3])(uint8_t *src, int topleft, int topright, int stride);
 194     void (*pred8x8  [4+3])(uint8_t *src, int stride);
 195     void (*pred16x16[4+3])(uint8_t *src, int stride);
         /* The four *_samples_available words are bit masks that fill_caches()
          * initialises for intra macroblocks and then clears bits in, depending
          * on which neighbours may be used for prediction. */
 196     unsigned int topleft_samples_available;
 197     unsigned int top_samples_available;
 198     unsigned int topright_samples_available;
 199     unsigned int left_samples_available;
 200     uint8_t (*top_borders[2])[16+2*8];
 201     uint8_t left_border[2*(17+2*9)];
 202
 203     /**
 204      * non zero coeff count cache.
 205      * is 64 if not available.
          * (fill_caches() stores 0 instead of 64 when pps.cabac is set and the
          * current macroblock is not intra.)
 206      */
 207     DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]);
 208     uint8_t (*non_zero_count)[16];
 209
 210     /**
 211      * Motion vector cache.
 212      */
 213     DECLARE_ALIGNED_8(int16_t, mv_cache[2][5*8][2]);
 214     DECLARE_ALIGNED_8(int8_t, ref_cache[2][5*8]);
 215 #define LIST_NOT_USED -1 //FIXME rename?
 216 #define PART_NOT_AVAILABLE -2
 217
 218     /**
 219      * is 1 if the specific list MV&references are set to 0,0,-2.
 220      */
 221     int mv_cache_clean[2];
 222
 223     /**
 224      * number of neighbors (top and/or left) that used 8x8 dct
 225      */
 226     int neighbor_transform_size;
 227
 228     /**
 229      * block_offset[ 0..23] for frame macroblocks
 230      * block_offset[24..47] for field macroblocks
 231      */
 232     int block_offset[2*(16+8)];
 233
 234     uint32_t *mb2b_xy; //FIXME are these 4 a good idea?
 235     uint32_t *mb2b8_xy;
 236     int b_stride; //FIXME use s->b4_stride
 237     int b8_stride;
 238
 239     int mb_linesize;   ///< may be equal to s->linesize or s->linesize*2, for mbaff
 240     int mb_uvlinesize;
 241
 242     int emu_edge_width;
 243     int emu_edge_height;
 244
 245     int halfpel_flag;
 246     int thirdpel_flag;
 247
 248     int unknown_svq3_flag;
 249     int next_slice_index;
 250
 251     SPS sps_buffer[MAX_SPS_COUNT];
 252     SPS sps; ///< current sps
 253
 254     PPS pps_buffer[MAX_PPS_COUNT];
 255     /**
 256      * current pps
 257      */
 258     PPS pps; //FIXME move to Picture perhaps? (->no) do we need that?
 259
 260     uint32_t dequant4_buffer[6][52][16];
 261     uint32_t dequant8_buffer[2][52][64];
 262     uint32_t (*dequant4_coeff[6])[16];
 263     uint32_t (*dequant8_coeff[2])[64];
 264     int dequant_coeff_pps;     ///< reinit tables when pps changes
 265
 266     int slice_num;
 267     uint8_t *slice_table_base;
 268     uint8_t *slice_table;      ///< slice_table_base + 2*mb_stride + 1
 269     int slice_type;
 270     int slice_type_fixed;
 271
 272     //interlacing specific flags
 273     int mb_aff_frame;
 274     int mb_field_decoding_flag;
 275     int mb_mbaff;              ///< mb_aff_frame && mb_field_decoding_flag
 276
 277     unsigned int sub_mb_type[4];
 278
 279     //POC stuff
 280     int poc_lsb;
 281     int poc_msb;
 282     int delta_poc_bottom;
 283     int delta_poc[2];
 284     int frame_num;
 285     int prev_poc_msb;             ///< poc_msb of the last reference pic for POC type 0
 286     int prev_poc_lsb;             ///< poc_lsb of the last reference pic for POC type 0
 287     int frame_num_offset;         ///< for POC type 2
 288     int prev_frame_num_offset;    ///< for POC type 2
 289     int prev_frame_num;           ///< frame_num of the last pic for POC type 1/2
 290
 291     /**
 292      * frame_num for frames or 2*frame_num for field pics.
 293      */
 294     int curr_pic_num;
 295
 296     /**
 297      * max_frame_num or 2*max_frame_num for field pics.
 298      */
 299     int max_pic_num;
 300
 301     //Weighted pred stuff
 302     int use_weight;
 303     int use_weight_chroma;
 304     int luma_log2_weight_denom;
 305     int chroma_log2_weight_denom;
 306     int luma_weight[2][48];
 307     int luma_offset[2][48];
 308     int chroma_weight[2][48][2];
 309     int chroma_offset[2][48][2];
 310     int implicit_weight[48][48];
 311
 312     //deblock
 313     int deblocking_filter;         ///< disable_deblocking_filter_idc with 1<->0
 314     int slice_alpha_c0_offset;
 315     int slice_beta_offset;
 316
 317     int redundant_pic_count;
 318
 319     int direct_spatial_mv_pred;
 320     int dist_scale_factor[16];
 321     int dist_scale_factor_field[32];
 322     int map_col_to_list0[2][16];
 323     int map_col_to_list0_field[2][32];
 324
 325     /**
 326      * num_ref_idx_l0/1_active_minus1 + 1
 327      */
 328     unsigned int ref_count[2];   ///< counts frames or fields, depending on current mb mode
 329     unsigned int list_count;
 330     Picture *short_ref[32];
 331     Picture *long_ref[32];
 332     Picture default_ref_list[2][32];
 333     Picture ref_list[2][48];     ///< 0..15: frame refs, 16..47: mbaff field refs
 334     Picture *delayed_pic[18]; //FIXME size?
 335     Picture *delayed_output_pic;
 336
 337     /**
 338      * memory management control operations buffer.
 339      */
 340     MMCO mmco[MAX_MMCO_COUNT];
 341     int mmco_index;
 342
 343     int long_ref_count;  ///< number of actual long term references
 344     int short_ref_count; ///< number of actual short term references
 345
 346     //data partitioning
 347     GetBitContext intra_gb;
 348     GetBitContext inter_gb;
 349     GetBitContext *intra_gb_ptr;
 350     GetBitContext *inter_gb_ptr;
 351
 352     DECLARE_ALIGNED_8(DCTELEM, mb[16*24]);
 353     DCTELEM mb_padding[256];        ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not too large or ensure that there is some unused stuff after mb
 354
 355     /**
 356      * Cabac
 357      */
 358     CABACContext cabac;
 359     uint8_t      cabac_state[460];
 360     int          cabac_init_idc;
 361
 362     /* 0x100 -> non null luma_dc, 0x80/0x40 -> non null chroma_dc (cb/cr), 0x?0 -> chroma_cbp(0,1,2), 0x0? luma_cbp */
 363     uint16_t     *cbp_table;
 364     int cbp;
 365     int top_cbp;    ///< written by fill_caches() when pps.cabac is set
 366     int left_cbp;   ///< written by fill_caches() when pps.cabac is set
 367     /* chroma_pred_mode for i4x4 or i16x16, else 0 */
 368     uint8_t     *chroma_pred_mode_table;
 369     int         last_qscale_diff;
 370     int16_t     (*mvd_table[2])[2];
 371     DECLARE_ALIGNED_8(int16_t, mvd_cache[2][5*8][2]);
 372     uint8_t     *direct_table;
 373     uint8_t     direct_cache[5*8];
 374
 375     uint8_t zigzag_scan[16];
 376     uint8_t zigzag_scan8x8[64];
 377     uint8_t zigzag_scan8x8_cavlc[64];
 378     uint8_t field_scan[16];
 379     uint8_t field_scan8x8[64];
 380     uint8_t field_scan8x8_cavlc[64];
         /* NOTE(review): the *_q0 pointers are presumably the scan tables to use
          * at qscale 0 (transform bypass) -- confirm where they are assigned. */
 381     const uint8_t *zigzag_scan_q0;
 382     const uint8_t *zigzag_scan8x8_q0;
 383     const uint8_t *zigzag_scan8x8_cavlc_q0;
 384     const uint8_t *field_scan_q0;
 385     const uint8_t *field_scan8x8_q0;
 386     const uint8_t *field_scan8x8_cavlc_q0;
 387
 388     int x264_build;
 389 }H264Context;
 390
     /* File-scope VLC tables. NOTE(review): presumably the CAVLC residual
      * decoding tables (coefficient token, total zeros, run-before), built once
      * at init time -- their initialisation is outside the visible chunk. */
 391 static VLC coeff_token_vlc[4];
 392 static VLC chroma_dc_coeff_token_vlc;
 393
 394 static VLC total_zeros_vlc[15];
 395 static VLC chroma_dc_total_zeros_vlc[3];
 396
 397 static VLC run_vlc[6];
 398 static VLC run7_vlc;
 399
     /* Forward declarations of file-local functions defined later in the file. */
 400 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
 401 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
 402 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
 403 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
 404
 405 static av_always_inline uint32_t pack16to32(int a, int b){
 406 #ifdef WORDS_BIGENDIAN
 407    return (b&0xFFFF) + (a<<16);
 408 #else
 409    return (a&0xFFFF) + (b<<16);
 410 #endif
 411 }
 412
     /* ff_rem6[i] == i % 6 for i in 0..51.
      * NOTE(review): presumably indexed by a quantiser value (0..51) -- confirm. */
 413 const uint8_t ff_rem6[52]={
 414 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
 415 };
 416
     /* ff_div6[i] == i / 6 for i in 0..51 (same index as ff_rem6). */
 417 const uint8_t ff_div6[52]={
 418 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
 419 };
 420
 421
 422 /**
 423  * fill a rectangle.
 424  * @param h height of the rectangle, should be a constant
 425  * @param w width of the rectangle, should be a constant
 426  * @param size the size of val (1 or 4), should be a constant
 427  */
 428 static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
 429     uint8_t *p= (uint8_t*)vp;
 430     assert(size==1 || size==4);
 431     assert(w<=4);
 432
 433     w      *= size;
 434     stride *= size;
 435
 436     assert((((long)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
 437     assert((stride&(w-1))==0);
 438     if(w==2){
 439         const uint16_t v= size==4 ? val : val*0x0101;
 440         *(uint16_t*)(p + 0*stride)= v;
 441         if(h==1) return;
 442         *(uint16_t*)(p + 1*stride)= v;
 443         if(h==2) return;
 444         *(uint16_t*)(p + 2*stride)=
 445         *(uint16_t*)(p + 3*stride)= v;
 446     }else if(w==4){
 447         const uint32_t v= size==4 ? val : val*0x01010101;
 448         *(uint32_t*)(p + 0*stride)= v;
 449         if(h==1) return;
 450         *(uint32_t*)(p + 1*stride)= v;
 451         if(h==2) return;
 452         *(uint32_t*)(p + 2*stride)=
 453         *(uint32_t*)(p + 3*stride)= v;
 454     }else if(w==8){
 455     //gcc can't optimize 64bit math on x86_32
 456 #if defined(ARCH_X86_64) || (defined(MP_WORDSIZE) && MP_WORDSIZE >= 64)
 457         const uint64_t v= val*0x0100000001ULL;
 458         *(uint64_t*)(p + 0*stride)= v;
 459         if(h==1) return;
 460         *(uint64_t*)(p + 1*stride)= v;
 461         if(h==2) return;
 462         *(uint64_t*)(p + 2*stride)=
 463         *(uint64_t*)(p + 3*stride)= v;
 464     }else if(w==16){
 465         const uint64_t v= val*0x0100000001ULL;
 466         *(uint64_t*)(p + 0+0*stride)=
 467         *(uint64_t*)(p + 8+0*stride)=
 468         *(uint64_t*)(p + 0+1*stride)=
 469         *(uint64_t*)(p + 8+1*stride)= v;
 470         if(h==2) return;
 471         *(uint64_t*)(p + 0+2*stride)=
 472         *(uint64_t*)(p + 8+2*stride)=
 473         *(uint64_t*)(p + 0+3*stride)=
 474         *(uint64_t*)(p + 8+3*stride)= v;
 475 #else
 476         *(uint32_t*)(p + 0+0*stride)=
 477         *(uint32_t*)(p + 4+0*stride)= val;
 478         if(h==1) return;
 479         *(uint32_t*)(p + 0+1*stride)=
 480         *(uint32_t*)(p + 4+1*stride)= val;
 481         if(h==2) return;
 482         *(uint32_t*)(p + 0+2*stride)=
 483         *(uint32_t*)(p + 4+2*stride)=
 484         *(uint32_t*)(p + 0+3*stride)=
 485         *(uint32_t*)(p + 4+3*stride)= val;
 486     }else if(w==16){
 487         *(uint32_t*)(p + 0+0*stride)=
 488         *(uint32_t*)(p + 4+0*stride)=
 489         *(uint32_t*)(p + 8+0*stride)=
 490         *(uint32_t*)(p +12+0*stride)=
 491         *(uint32_t*)(p + 0+1*stride)=
 492         *(uint32_t*)(p + 4+1*stride)=
 493         *(uint32_t*)(p + 8+1*stride)=
 494         *(uint32_t*)(p +12+1*stride)= val;
 495         if(h==2) return;
 496         *(uint32_t*)(p + 0+2*stride)=
 497         *(uint32_t*)(p + 4+2*stride)=
 498         *(uint32_t*)(p + 8+2*stride)=
 499         *(uint32_t*)(p +12+2*stride)=
 500         *(uint32_t*)(p + 0+3*stride)=
 501         *(uint32_t*)(p + 4+3*stride)=
 502         *(uint32_t*)(p + 8+3*stride)=
 503         *(uint32_t*)(p +12+3*stride)= val;
 504 #endif
 505     }else
 506         assert(0);
 507     assert(h==4);
 508 }
 509
 510 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 511     MpegEncContext * const s = &h->s;
 512     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 513     int topleft_xy, top_xy, topright_xy, left_xy[2];
 514     int topleft_type, top_type, topright_type, left_type[2];
 515     int left_block[8];
 516     int i;
 517
 518     //FIXME deblocking could skip the intra and nnz parts.
 519     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[mb_xy-s->mb_stride]) && !FRAME_MBAFF)
 520         return;
 521
 522     //wow what a mess, why didn't they simplify the interlacing&intra stuff, i can't imagine that these complex rules are worth it
 523
 524     top_xy     = mb_xy  - s->mb_stride;
 525     topleft_xy = top_xy - 1;
 526     topright_xy= top_xy + 1;
 527     left_xy[1] = left_xy[0] = mb_xy-1;
 528     left_block[0]= 0;
 529     left_block[1]= 1;
 530     left_block[2]= 2;
 531     left_block[3]= 3;
 532     left_block[4]= 7;
 533     left_block[5]= 10;
 534     left_block[6]= 8;
 535     left_block[7]= 11;
 536     if(FRAME_MBAFF){
 537         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 538         const int top_pair_xy      = pair_xy     - s->mb_stride;
 539         const int topleft_pair_xy  = top_pair_xy - 1;
 540         const int topright_pair_xy = top_pair_xy + 1;
 541         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 542         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 543         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 544         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 545         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 546         const int bottom = (s->mb_y & 1);
 547         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 548         if (bottom
 549                 ? !curr_mb_frame_flag // bottom macroblock
 550                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 551                 ) {
 552             top_xy -= s->mb_stride;
 553         }
 554         if (bottom
 555                 ? !curr_mb_frame_flag // bottom macroblock
 556                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 557                 ) {
 558             topleft_xy -= s->mb_stride;
 559         }
 560         if (bottom
 561                 ? !curr_mb_frame_flag // bottom macroblock
 562                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 563                 ) {
 564             topright_xy -= s->mb_stride;
 565         }
 566         if (left_mb_frame_flag != curr_mb_frame_flag) {
 567             left_xy[1] = left_xy[0] = pair_xy - 1;
 568             if (curr_mb_frame_flag) {
 569                 if (bottom) {
 570                     left_block[0]= 2;
 571                     left_block[1]= 2;
 572                     left_block[2]= 3;
 573                     left_block[3]= 3;
 574                     left_block[4]= 8;
 575                     left_block[5]= 11;
 576                     left_block[6]= 8;
 577                     left_block[7]= 11;
 578                 } else {
 579                     left_block[0]= 0;
 580                     left_block[1]= 0;
 581                     left_block[2]= 1;
 582                     left_block[3]= 1;
 583                     left_block[4]= 7;
 584                     left_block[5]= 10;
 585                     left_block[6]= 7;
 586                     left_block[7]= 10;
 587                 }
 588             } else {
 589                 left_xy[1] += s->mb_stride;
 590                 //left_block[0]= 0;
 591                 left_block[1]= 2;
 592                 left_block[2]= 0;
 593                 left_block[3]= 2;
 594                 //left_block[4]= 7;
 595                 left_block[5]= 10;
 596                 left_block[6]= 7;
 597                 left_block[7]= 10;
 598             }
 599         }
 600     }
 601
 602     h->top_mb_xy = top_xy;
 603     h->left_mb_xy[0] = left_xy[0];
 604     h->left_mb_xy[1] = left_xy[1];
 605     if(for_deblock){
 606         topleft_type = 0;
 607         topright_type = 0;
 608         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
 609         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
 610         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
 611
 612         if(FRAME_MBAFF && !IS_INTRA(mb_type)){
 613             int list;
 614             int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
 615             for(i=0; i<16; i++)
 616                 h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
 617             for(list=0; list<h->list_count; list++){
 618                 if(USES_LIST(mb_type,list)){
 619                     uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
 620                     uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
 621                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 622                     for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
 623                         dst[0] = src[0];
 624                         dst[1] = src[1];
 625                         dst[2] = src[2];
 626                         dst[3] = src[3];
 627                     }
 628                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 629                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
 630                     ref += h->b8_stride;
 631                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 632                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
 633                 }else{
 634                     fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
 635                     fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
 636                 }
 637             }
 638         }
 639     }else{
 640         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 641         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 642         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 643         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 644         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 645     }
 646
 647     if(IS_INTRA(mb_type)){
 648         h->topleft_samples_available=
 649         h->top_samples_available=
 650         h->left_samples_available= 0xFFFF;
 651         h->topright_samples_available= 0xEEEA;
 652
 653         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
 654             h->topleft_samples_available= 0xB3FF;
 655             h->top_samples_available= 0x33FF;
 656             h->topright_samples_available= 0x26EA;
 657         }
 658         for(i=0; i<2; i++){
 659             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
 660                 h->topleft_samples_available&= 0xDF5F;
 661                 h->left_samples_available&= 0x5F5F;
 662             }
 663         }
 664
 665         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
 666             h->topleft_samples_available&= 0x7FFF;
 667
 668         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
 669             h->topright_samples_available&= 0xFBFF;
 670
 671         if(IS_INTRA4x4(mb_type)){
 672             if(IS_INTRA4x4(top_type)){
 673                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 674                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 675                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 676                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 677             }else{
 678                 int pred;
 679                 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
 680                     pred= -1;
 681                 else{
 682                     pred= 2;
 683                 }
 684                 h->intra4x4_pred_mode_cache[4+8*0]=
 685                 h->intra4x4_pred_mode_cache[5+8*0]=
 686                 h->intra4x4_pred_mode_cache[6+8*0]=
 687                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 688             }
 689             for(i=0; i<2; i++){
 690                 if(IS_INTRA4x4(left_type[i])){
 691                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 692                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 693                 }else{
 694                     int pred;
 695                     if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
 696                         pred= -1;
 697                     else{
 698                         pred= 2;
 699                     }
 700                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 701                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 702                 }
 703             }
 704         }
 705     }
 706
 707
 708 /*
 709 0 . T T. T T T T
 710 1 L . .L . . . .
 711 2 L . .L . . . .
 712 3 . T TL . . . .
 713 4 L . .L . . . .
 714 5 L . .. . . . .
 715 */
 716 //FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
 717     if(top_type){
 718         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 719         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 720         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 721         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 722
 723         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 724         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 725
 726         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 727         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 728
 729     }else{
 730         h->non_zero_count_cache[4+8*0]=
 731         h->non_zero_count_cache[5+8*0]=
 732         h->non_zero_count_cache[6+8*0]=
 733         h->non_zero_count_cache[7+8*0]=
 734
 735         h->non_zero_count_cache[1+8*0]=
 736         h->non_zero_count_cache[2+8*0]=
 737
 738         h->non_zero_count_cache[1+8*3]=
 739         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 740
 741     }
 742
 743     for (i=0; i<2; i++) {
 744         if(left_type[i]){
 745             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 746             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 747             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 748             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 749         }else{
 750             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 751             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 752             h->non_zero_count_cache[0+8*1 +   8*i]=
 753             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 754         }
 755     }
 756
 757     if( h->pps.cabac ) {
 758         // top_cbp
 759         if(top_type) {
 760             h->top_cbp = h->cbp_table[top_xy];
 761         } else if(IS_INTRA(mb_type)) {
 762             h->top_cbp = 0x1C0;
 763         } else {
 764             h->top_cbp = 0;
 765         }
 766         // left_cbp
 767         if (left_type[0]) {
 768             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 769         } else if(IS_INTRA(mb_type)) {
 770             h->left_cbp = 0x1C0;
 771         } else {
 772             h->left_cbp = 0;
 773         }
 774         if (left_type[0]) {
 775             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 776         }
 777         if (left_type[1]) {
 778             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 779         }
 780     }
 781
 782 #if 1
 783     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 784         int list;
 785         for(list=0; list<h->list_count; list++){
 786             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 787                 /*if(!h->mv_cache_clean[list]){
 788                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 789                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 790                     h->mv_cache_clean[list]= 1;
 791                 }*/
 792                 continue;
 793             }
 794             h->mv_cache_clean[list]= 0;
 795
 796             if(USES_LIST(top_type, list)){
 797                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 798                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 799                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 800                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 801                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 802                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 803                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 804                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 805                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 806                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 807             }else{
 808                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 809                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 810                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 811                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 812                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 813             }
 814
 815             for(i=0; i<2; i++){
 816                 int cache_idx = scan8[0] - 1 + i*2*8;
 817                 if(USES_LIST(left_type[i], list)){
 818                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 819                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 820                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 821                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 822                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 823                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 824                 }else{
 825                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 826                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 827                     h->ref_cache[list][cache_idx  ]=
 828                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 829                 }
 830             }
 831
 832             if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
 833                 continue;
 834
 835             if(USES_LIST(topleft_type, list)){
 836                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
 837                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride;
 838                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 839                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 840             }else{
 841                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 842                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 843             }
 844
 845             if(USES_LIST(topright_type, list)){
 846                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 847                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 848                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 849                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 850             }else{
 851                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 852                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 853             }
 854
 855             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 856                 continue;
 857
 858             h->ref_cache[list][scan8[5 ]+1] =
 859             h->ref_cache[list][scan8[7 ]+1] =
 860             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 861             h->ref_cache[list][scan8[4 ]] =
 862             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 863             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 864             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 865             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 866             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 867             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 868
 869             if( h->pps.cabac ) {
 870                 /* XXX beurk, Load mvd */
 871                 if(USES_LIST(top_type, list)){
 872                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 873                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 874                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 875                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 876                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 877                 }else{
 878                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 879                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 880                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 881                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 882                 }
 883                 if(USES_LIST(left_type[0], list)){
 884                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 885                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 886                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 887                 }else{
 888                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 889                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 890                 }
 891                 if(USES_LIST(left_type[1], list)){
 892                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 893                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 894                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 895                 }else{
 896                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 897                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 898                 }
 899                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 900                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 901                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 902                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 903                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 904
 905                 if(h->slice_type == B_TYPE){
 906                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 907
 908                     if(IS_DIRECT(top_type)){
 909                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 910                     }else if(IS_8X8(top_type)){
 911                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 912                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 913                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 914                     }else{
 915                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 916                     }
 917
 918                     if(IS_DIRECT(left_type[0]))
 919                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 920                     else if(IS_8X8(left_type[0]))
 921                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 922                     else
 923                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 924
 925                     if(IS_DIRECT(left_type[1]))
 926                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 927                     else if(IS_8X8(left_type[1]))
 928                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 929                     else
 930                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 931                 }
 932             }
 933
 934             if(FRAME_MBAFF){
 935 #define MAP_MVS\
 936                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 937                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 938                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 939                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 940                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 941                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 942                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 943                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 944                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 945                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 946                 if(MB_FIELD){
 947 #define MAP_F2F(idx, mb_type)\
 948                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 949                         h->ref_cache[list][idx] <<= 1;\
 950                         h->mv_cache[list][idx][1] /= 2;\
 951                         h->mvd_cache[list][idx][1] /= 2;\
 952                     }
 953                     MAP_MVS
 954 #undef MAP_F2F
 955                 }else{
 956 #define MAP_F2F(idx, mb_type)\
 957                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 958                         h->ref_cache[list][idx] >>= 1;\
 959                         h->mv_cache[list][idx][1] <<= 1;\
 960                         h->mvd_cache[list][idx][1] <<= 1;\
 961                     }
 962                     MAP_MVS
 963 #undef MAP_F2F
 964                 }
 965             }
 966         }
 967     }
 968 #endif
 969
 970     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 971 }
 972
 973 static inline void write_back_intra_pred_mode(H264Context *h){
 974     MpegEncContext * const s = &h->s;
 975     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 976
 977     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 978     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 979     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 980     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 981     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 982     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 983     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 984 }
 985
 986 /**
 987  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 988  */
 989 static inline int check_intra4x4_pred_mode(H264Context *h){
 990     MpegEncContext * const s = &h->s;
 991     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 992     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 993     int i;
 994
 995     if(!(h->top_samples_available&0x8000)){
 996         for(i=0; i<4; i++){
 997             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 998             if(status<0){
 999                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
1000                 return -1;
1001             } else if(status){
1002                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
1003             }
1004         }
1005     }
1006
1007     if(!(h->left_samples_available&0x8000)){
1008         for(i=0; i<4; i++){
1009             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
1010             if(status<0){
1011                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
1012                 return -1;
1013             } else if(status){
1014                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
1015             }
1016         }
1017     }
1018
1019     return 0;
1020 } //FIXME cleanup like next
1021
1022 /**
1023  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
1024  */
1025 static inline int check_intra_pred_mode(H264Context *h, int mode){
1026     MpegEncContext * const s = &h->s;
1027     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
1028     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
1029
1030     if(mode > 6U) {
1031         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
1032         return -1;
1033     }
1034
1035     if(!(h->top_samples_available&0x8000)){
1036         mode= top[ mode ];
1037         if(mode<0){
1038             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
1039             return -1;
1040         }
1041     }
1042
1043     if(!(h->left_samples_available&0x8000)){
1044         mode= left[ mode ];
1045         if(mode<0){
1046             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
1047             return -1;
1048         }
1049     }
1050
1051     return mode;
1052 }
1053
1054 /**
1055  * gets the predicted intra4x4 prediction mode.
1056  */
1057 static inline int pred_intra_mode(H264Context *h, int n){
1058     const int index8= scan8[n];
1059     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
1060     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
1061     const int min= FFMIN(left, top);
1062
1063     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
1064
1065     if(min<0) return DC_PRED;
1066     else      return min;
1067 }
1068
1069 static inline void write_back_non_zero_count(H264Context *h){
1070     MpegEncContext * const s = &h->s;
1071     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
1072
1073     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
1074     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
1075     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
1076     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
1077     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
1078     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
1079     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
1080
1081     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
1082     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
1083     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
1084
1085     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
1086     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
1087     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
1088
1089     if(FRAME_MBAFF){
1090         // store all luma nnzs, for deblocking
1091         int v = 0, i;
1092         for(i=0; i<16; i++)
1093             v += (!!h->non_zero_count_cache[scan8[i]]) << i;
1094         *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
1095     }
1096 }
1097
1098 /**
1099  * gets the predicted number of non zero coefficients.
1100  * @param n block index
1101  */
1102 static inline int pred_non_zero_count(H264Context *h, int n){
1103     const int index8= scan8[n];
1104     const int left= h->non_zero_count_cache[index8 - 1];
1105     const int top = h->non_zero_count_cache[index8 - 8];
1106     int i= left + top;
1107
1108     if(i<64) i= (i+1)>>1;
1109
1110     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
1111
1112     return i&31;
1113 }
1114
1115 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
1116     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
1117     MpegEncContext *s = &h->s;
1118
1119     /* there is no consistent mapping of mvs to neighboring locations that will
1120      * make mbaff happy, so we can't move all this logic to fill_caches */
1121     if(FRAME_MBAFF){
1122         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
1123         const int16_t *mv;
1124         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
1125         *C = h->mv_cache[list][scan8[0]-2];
1126
1127         if(!MB_FIELD
1128            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
1129             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
1130             if(IS_INTERLACED(mb_types[topright_xy])){
1131 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
1132                 const int x4 = X4, y4 = Y4;\
1133                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
1134                 if(!USES_LIST(mb_type,list) && !IS_8X8(mb_type))\
1135                     return LIST_NOT_USED;\
1136                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
1137                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
1138                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
1139                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
1140
1141                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
1142             }
1143         }
1144         if(topright_ref == PART_NOT_AVAILABLE
1145            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
1146            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
1147             if(!MB_FIELD
1148                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
1149                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
1150             }
1151             if(MB_FIELD
1152                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
1153                && i >= scan8[0]+8){
1154                 // leftshift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's ok.
1155                 SET_DIAG_MV(>>1, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
1156             }
1157         }
1158 #undef SET_DIAG_MV
1159     }
1160
1161     if(topright_ref != PART_NOT_AVAILABLE){
1162         *C= h->mv_cache[list][ i - 8 + part_width ];
1163         return topright_ref;
1164     }else{
1165         tprintf(s->avctx, "topright MV not available\n");
1166
1167         *C= h->mv_cache[list][ i - 8 - 1 ];
1168         return h->ref_cache[list][ i - 8 - 1 ];
1169     }
1170 }
1171
1172 /**
1173  * gets the predicted MV.
1174  * @param n the block index
1175  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
1176  * @param mx the x component of the predicted motion vector
1177  * @param my the y component of the predicted motion vector
1178  */
1179 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
1180     const int index8= scan8[n];
1181     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
1182     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
1183     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
1184     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
1185     const int16_t * C;
1186     int diagonal_ref, match_count;
1187
1188     assert(part_width==1 || part_width==2 || part_width==4);
1189
1190 /* mv_cache
1191   B . . A T T T T
1192   U . . L . . , .
1193   U . . L . . . .
1194   U . . L . . , .
1195   . . . L . . . .
1196 */
1197
1198     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
1199     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
1200     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
1201     if(match_count > 1){ //most common
1202         *mx= mid_pred(A[0], B[0], C[0]);
1203         *my= mid_pred(A[1], B[1], C[1]);
1204     }else if(match_count==1){
1205         if(left_ref==ref){
1206             *mx= A[0];
1207             *my= A[1];
1208         }else if(top_ref==ref){
1209             *mx= B[0];
1210             *my= B[1];
1211         }else{
1212             *mx= C[0];
1213             *my= C[1];
1214         }
1215     }else{
1216         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
1217             *mx= A[0];
1218             *my= A[1];
1219         }else{
1220             *mx= mid_pred(A[0], B[0], C[0]);
1221             *my= mid_pred(A[1], B[1], C[1]);
1222         }
1223     }
1224
1225     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
1226 }
1227
1228 /**
1229  * gets the directionally predicted 16x8 MV.
1230  * @param n the block index
1231  * @param mx the x component of the predicted motion vector
1232  * @param my the y component of the predicted motion vector
1233  */
1234 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
1235     if(n==0){
1236         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
1237         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
1238
1239         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
1240
1241         if(top_ref == ref){
1242             *mx= B[0];
1243             *my= B[1];
1244             return;
1245         }
1246     }else{
1247         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
1248         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
1249
1250         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
1251
1252         if(left_ref == ref){
1253             *mx= A[0];
1254             *my= A[1];
1255             return;
1256         }
1257     }
1258
1259     //RARE
1260     pred_motion(h, n, 4, list, ref, mx, my);
1261 }
1262
1263 /**
1264  * gets the directionally predicted 8x16 MV.
1265  * @param n the block index
1266  * @param mx the x component of the predicted motion vector
1267  * @param my the y component of the predicted motion vector
1268  */
1269 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
1270     if(n==0){
1271         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
1272         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
1273
1274         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
1275
1276         if(left_ref == ref){
1277             *mx= A[0];
1278             *my= A[1];
1279             return;
1280         }
1281     }else{
1282         const int16_t * C;
1283         int diagonal_ref;
1284
1285         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
1286
1287         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
1288
1289         if(diagonal_ref == ref){
1290             *mx= C[0];
1291             *my= C[1];
1292             return;
1293         }
1294     }
1295
1296     //RARE
1297     pred_motion(h, n, 2, list, ref, mx, my);
1298 }
1299
1300 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
1301     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
1302     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
1303
1304     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
1305
1306     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
1307        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
1308        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
1309
1310         *mx = *my = 0;
1311         return;
1312     }
1313
1314     pred_motion(h, 0, 4, 0, 0, mx, my);
1315
1316     return;
1317 }
1318
1319 static inline void direct_dist_scale_factor(H264Context * const h){
1320     const int poc = h->s.current_picture_ptr->poc;
1321     const int poc1 = h->ref_list[1][0].poc;
1322     int i;
1323     for(i=0; i<h->ref_count[0]; i++){
1324         int poc0 = h->ref_list[0][i].poc;
1325         int td = av_clip(poc1 - poc0, -128, 127);
1326         if(td == 0 /* FIXME || pic0 is a long-term ref */){
1327             h->dist_scale_factor[i] = 256;
1328         }else{
1329             int tb = av_clip(poc - poc0, -128, 127);
1330             int tx = (16384 + (FFABS(td) >> 1)) / td;
1331             h->dist_scale_factor[i] = av_clip((tb*tx + 32) >> 6, -1024, 1023);
1332         }
1333     }
1334     if(FRAME_MBAFF){
1335         for(i=0; i<h->ref_count[0]; i++){
1336             h->dist_scale_factor_field[2*i] =
1337             h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
1338         }
1339     }
1340 }
1341 static inline void direct_ref_list_init(H264Context * const h){
1342     MpegEncContext * const s = &h->s;
1343     Picture * const ref1 = &h->ref_list[1][0];
1344     Picture * const cur = s->current_picture_ptr;
1345     int list, i, j;
1346     if(cur->pict_type == I_TYPE)
1347         cur->ref_count[0] = 0;
1348     if(cur->pict_type != B_TYPE)
1349         cur->ref_count[1] = 0;
1350     for(list=0; list<2; list++){
1351         cur->ref_count[list] = h->ref_count[list];
1352         for(j=0; j<h->ref_count[list]; j++)
1353             cur->ref_poc[list][j] = h->ref_list[list][j].poc;
1354     }
1355     if(cur->pict_type != B_TYPE || h->direct_spatial_mv_pred)
1356         return;
1357     for(list=0; list<2; list++){
1358         for(i=0; i<ref1->ref_count[list]; i++){
1359             const int poc = ref1->ref_poc[list][i];
1360             h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
1361             for(j=0; j<h->ref_count[list]; j++)
1362                 if(h->ref_list[list][j].poc == poc){
1363                     h->map_col_to_list0[list][i] = j;
1364                     break;
1365                 }
1366         }
1367     }
1368     if(FRAME_MBAFF){
1369         for(list=0; list<2; list++){
1370             for(i=0; i<ref1->ref_count[list]; i++){
1371                 j = h->map_col_to_list0[list][i];
1372                 h->map_col_to_list0_field[list][2*i] = 2*j;
1373                 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
1374             }
1375         }
1376     }
1377 }
1378
1379 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
1380     MpegEncContext * const s = &h->s;
1381     const int mb_xy =   s->mb_x +   s->mb_y*s->mb_stride;
1382     const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1383     const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1384     const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
1385     const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
1386     const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
1387     const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
1388     const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
1389     const int is_b8x8 = IS_8X8(*mb_type);
1390     unsigned int sub_mb_type;
1391     int i8, i4;
1392
1393 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
1394     if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
1395         /* FIXME save sub mb types from previous frames (or derive from MVs)
1396          * so we know exactly what block size to use */
1397         sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1398         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
1399     }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
1400         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1401         *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1402     }else{
1403         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1404         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
1405     }
1406     if(!is_b8x8)
1407         *mb_type |= MB_TYPE_DIRECT2;
1408     if(MB_FIELD)
1409         *mb_type |= MB_TYPE_INTERLACED;
1410
1411     tprintf(s->avctx, "mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);
1412
1413     if(h->direct_spatial_mv_pred){
1414         int ref[2];
1415         int mv[2][2];
1416         int list;
1417
1418         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1419
1420         /* ref = min(neighbors) */
1421         for(list=0; list<2; list++){
1422             int refa = h->ref_cache[list][scan8[0] - 1];
1423             int refb = h->ref_cache[list][scan8[0] - 8];
1424             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1425             if(refc == -2)
1426                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1427             ref[list] = refa;
1428             if(ref[list] < 0 || (refb < ref[list] && refb >= 0))
1429                 ref[list] = refb;
1430             if(ref[list] < 0 || (refc < ref[list] && refc >= 0))
1431                 ref[list] = refc;
1432             if(ref[list] < 0)
1433                 ref[list] = -1;
1434         }
1435
1436         if(ref[0] < 0 && ref[1] < 0){
1437             ref[0] = ref[1] = 0;
1438             mv[0][0] = mv[0][1] =
1439             mv[1][0] = mv[1][1] = 0;
1440         }else{
1441             for(list=0; list<2; list++){
1442                 if(ref[list] >= 0)
1443                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1444                 else
1445                     mv[list][0] = mv[list][1] = 0;
1446             }
1447         }
1448
1449         if(ref[1] < 0){
1450             *mb_type &= ~MB_TYPE_P0L1;
1451             sub_mb_type &= ~MB_TYPE_P0L1;
1452         }else if(ref[0] < 0){
1453             *mb_type &= ~MB_TYPE_P0L0;
1454             sub_mb_type &= ~MB_TYPE_P0L0;
1455         }
1456
1457         if(IS_16X16(*mb_type)){
1458             int a=0, b=0;
1459
1460             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1461             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1462             if(!IS_INTRA(mb_type_col)
1463                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1464                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1465                        && (h->x264_build>33 || !h->x264_build)))){
1466                 if(ref[0] > 0)
1467                     a= pack16to32(mv[0][0],mv[0][1]);
1468                 if(ref[1] > 0)
1469                     b= pack16to32(mv[1][0],mv[1][1]);
1470             }else{
1471                 a= pack16to32(mv[0][0],mv[0][1]);
1472                 b= pack16to32(mv[1][0],mv[1][1]);
1473             }
1474             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1475             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1476         }else{
1477             for(i8=0; i8<4; i8++){
1478                 const int x8 = i8&1;
1479                 const int y8 = i8>>1;
1480
1481                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1482                     continue;
1483                 h->sub_mb_type[i8] = sub_mb_type;
1484
1485                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1486                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1487                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1488                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1489
1490                 /* col_zero_flag */
1491                 if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
1492                                               || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
1493                                                   && (h->x264_build>33 || !h->x264_build)))){
1494                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
1495                     if(IS_SUB_8X8(sub_mb_type)){
1496                         const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1497                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1498                             if(ref[0] == 0)
1499                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1500                             if(ref[1] == 0)
1501                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1502                         }
1503                     }else
1504                     for(i4=0; i4<4; i4++){
1505                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1506                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1507                             if(ref[0] == 0)
1508                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1509                             if(ref[1] == 0)
1510                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1511                         }
1512                     }
1513                 }
1514             }
1515         }
1516     }else{ /* direct temporal mv pred */
1517         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1518         const int *dist_scale_factor = h->dist_scale_factor;
1519
1520         if(FRAME_MBAFF){
1521             if(IS_INTERLACED(*mb_type)){
1522                 map_col_to_list0[0] = h->map_col_to_list0_field[0];
1523                 map_col_to_list0[1] = h->map_col_to_list0_field[1];
1524                 dist_scale_factor = h->dist_scale_factor_field;
1525             }
1526             if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1527                 /* FIXME assumes direct_8x8_inference == 1 */
1528                 const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1529                 int mb_types_col[2];
1530                 int y_shift;
1531
1532                 *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
1533                          | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
1534                          | (*mb_type & MB_TYPE_INTERLACED);
1535                 sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;
1536
1537                 if(IS_INTERLACED(*mb_type)){
1538                     /* frame to field scaling */
1539                     mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1540                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1541                     if(s->mb_y&1){
1542                         l1ref0 -= 2*h->b8_stride;
1543                         l1ref1 -= 2*h->b8_stride;
1544                         l1mv0 -= 4*h->b_stride;
1545                         l1mv1 -= 4*h->b_stride;
1546                     }
1547                     y_shift = 0;
1548
1549                     if(   (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
1550                        && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
1551                        && !is_b8x8)
1552                         *mb_type |= MB_TYPE_16x8;
1553                     else
1554                         *mb_type |= MB_TYPE_8x8;
1555                 }else{
1556                     /* field to frame scaling */
1557                     /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
1558                      * but in MBAFF, top and bottom POC are equal */
1559                     int dy = (s->mb_y&1) ? 1 : 2;
1560                     mb_types_col[0] =
1561                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1562                     l1ref0 += dy*h->b8_stride;
1563                     l1ref1 += dy*h->b8_stride;
1564                     l1mv0 += 2*dy*h->b_stride;
1565                     l1mv1 += 2*dy*h->b_stride;
1566                     y_shift = 2;
1567
1568                     if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
1569                        && !is_b8x8)
1570                         *mb_type |= MB_TYPE_16x16;
1571                     else
1572                         *mb_type |= MB_TYPE_8x8;
1573                 }
1574
1575                 for(i8=0; i8<4; i8++){
1576                     const int x8 = i8&1;
1577                     const int y8 = i8>>1;
1578                     int ref0, scale;
1579                     const int16_t (*l1mv)[2]= l1mv0;
1580
1581                     if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1582                         continue;
1583                     h->sub_mb_type[i8] = sub_mb_type;
1584
1585                     fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1586                     if(IS_INTRA(mb_types_col[y8])){
1587                         fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1588                         fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1589                         fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1590                         continue;
1591                     }
1592
1593                     ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
1594                     if(ref0 >= 0)
1595                         ref0 = map_col_to_list0[0][ref0*2>>y_shift];
1596                     else{
1597                         ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
1598                         l1mv= l1mv1;
1599                     }
1600                     scale = dist_scale_factor[ref0];
1601                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1602
1603                     {
1604                         const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
1605                         int my_col = (mv_col[1]<<y_shift)/2;
1606                         int mx = (scale * mv_col[0] + 128) >> 8;
1607                         int my = (scale * my_col + 128) >> 8;
1608                         fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1609                         fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1610                     }
1611                 }
1612                 return;
1613             }
1614         }
1615
1616         /* one-to-one mv scaling */
1617
1618         if(IS_16X16(*mb_type)){
1619             int ref, mv0, mv1;
1620
1621             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1622             if(IS_INTRA(mb_type_col)){
1623                 ref=mv0=mv1=0;
1624             }else{
1625                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
1626                                                 : map_col_to_list0[1][l1ref1[0]];
1627                 const int scale = dist_scale_factor[ref0];
1628                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1629                 int mv_l0[2];
1630                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1631                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1632                 ref= ref0;
1633                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1634                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1635             }
1636             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1637             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1638             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1639         }else{
1640             for(i8=0; i8<4; i8++){
1641                 const int x8 = i8&1;
1642                 const int y8 = i8>>1;
1643                 int ref0, scale;
1644                 const int16_t (*l1mv)[2]= l1mv0;
1645
1646                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1647                     continue;
1648                 h->sub_mb_type[i8] = sub_mb_type;
1649                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1650                 if(IS_INTRA(mb_type_col)){
1651                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1652                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1653                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1654                     continue;
1655                 }
1656
1657                 ref0 = l1ref0[x8 + y8*h->b8_stride];
1658                 if(ref0 >= 0)
1659                     ref0 = map_col_to_list0[0][ref0];
1660                 else{
1661                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
1662                     l1mv= l1mv1;
1663                 }
1664                 scale = dist_scale_factor[ref0];
1665
1666                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1667                 if(IS_SUB_8X8(sub_mb_type)){
1668                     const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1669                     int mx = (scale * mv_col[0] + 128) >> 8;
1670                     int my = (scale * mv_col[1] + 128) >> 8;
1671                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1672                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1673                 }else
1674                 for(i4=0; i4<4; i4++){
1675                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1676                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1677                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1678                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1679                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1680                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1681                 }
1682             }
1683         }
1684     }
1685 }
1686
1687 static inline void write_back_motion(H264Context *h, int mb_type){
1688     MpegEncContext * const s = &h->s;
1689     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1690     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1691     int list;
1692
1693     if(!USES_LIST(mb_type, 0))
1694         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1695
1696     for(list=0; list<h->list_count; list++){
1697         int y;
1698         if(!USES_LIST(mb_type, list))
1699             continue;
1700
1701         for(y=0; y<4; y++){
1702             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1703             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1704         }
1705         if( h->pps.cabac ) {
1706             if(IS_SKIP(mb_type))
1707                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1708             else
1709             for(y=0; y<4; y++){
1710                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1711                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1712             }
1713         }
1714
1715         {
1716             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1717             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1718             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1719             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1720             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1721         }
1722     }
1723
1724     if(h->slice_type == B_TYPE && h->pps.cabac){
1725         if(IS_8X8(mb_type)){
1726             uint8_t *direct_table = &h->direct_table[b8_xy];
1727             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1728             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1729             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1730         }
1731     }
1732 }
1733
1734 /**
1735  * Decodes a network abstraction layer unit.
1736  * @param consumed is the number of bytes used as input
1737  * @param length is the length of the array
1738  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1739  * @returns decoded bytes, might be src+1 if no escapes
1740  */
1741 static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *consumed, int length){
1742     int i, si, di;
1743     uint8_t *dst;
1744
1745 //    src[0]&0x80;                //forbidden bit
1746     h->nal_ref_idc= src[0]>>5;
1747     h->nal_unit_type= src[0]&0x1F;
1748
1749     src++; length--;
1750 #if 0
1751     for(i=0; i<length; i++)
1752         printf("%2X ", src[i]);
1753 #endif
1754     for(i=0; i+1<length; i+=2){
1755         if(src[i]) continue;
1756         if(i>0 && src[i-1]==0) i--;
1757         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1758             if(src[i+2]!=3){
1759                 /* startcode, so we must be past the end */
1760                 length=i;
1761             }
1762             break;
1763         }
1764     }
1765
1766     if(i>=length-1){ //no escaped 0
1767         *dst_length= length;
1768         *consumed= length+1; //+1 for the header
1769         return src;
1770     }
1771
1772     h->rbsp_buffer= av_fast_realloc(h->rbsp_buffer, &h->rbsp_buffer_size, length);
1773     dst= h->rbsp_buffer;
1774
1775     if (dst == NULL){
1776         return NULL;
1777     }
1778
1779 //printf("decoding esc\n");
1780     si=di=0;
1781     while(si<length){
1782         //remove escapes (very rare 1:2^22)
1783         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1784             if(src[si+2]==3){ //escape
1785                 dst[di++]= 0;
1786                 dst[di++]= 0;
1787                 si+=3;
1788                 continue;
1789             }else //next start code
1790                 break;
1791         }
1792
1793         dst[di++]= src[si++];
1794     }
1795
1796     *dst_length= di;
1797     *consumed= si + 1;//+1 for the header
1798 //FIXME store exact number of bits in the getbitcontext (its needed for decoding)
1799     return dst;
1800 }
1801
1802 /**
1803  * identifies the exact end of the bitstream
1804  * @return the length of the trailing, or 0 if damaged
1805  */
1806 static int decode_rbsp_trailing(H264Context *h, uint8_t *src){
1807     int v= *src;
1808     int r;
1809
1810     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1811
1812     for(r=1; r<9; r++){
1813         if(v&1) return r;
1814         v>>=1;
1815     }
1816     return 0;
1817 }
1818
1819 /**
1820  * idct tranforms the 16 dc values and dequantize them.
1821  * @param qp quantization parameter
1822  */
1823 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1824 #define stride 16
1825     int i;
1826     int temp[16]; //FIXME check if this is a good idea
1827     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1828     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1829
1830 //memset(block, 64, 2*256);
1831 //return;
1832     for(i=0; i<4; i++){
1833         const int offset= y_offset[i];
1834         const int z0= block[offset+stride*0] + block[offset+stride*4];
1835         const int z1= block[offset+stride*0] - block[offset+stride*4];
1836         const int z2= block[offset+stride*1] - block[offset+stride*5];
1837         const int z3= block[offset+stride*1] + block[offset+stride*5];
1838
1839         temp[4*i+0]= z0+z3;
1840         temp[4*i+1]= z1+z2;
1841         temp[4*i+2]= z1-z2;
1842         temp[4*i+3]= z0-z3;
1843     }
1844
1845     for(i=0; i<4; i++){
1846         const int offset= x_offset[i];
1847         const int z0= temp[4*0+i] + temp[4*2+i];
1848         const int z1= temp[4*0+i] - temp[4*2+i];
1849         const int z2= temp[4*1+i] - temp[4*3+i];
1850         const int z3= temp[4*1+i] + temp[4*3+i];
1851
1852         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_resdual
1853         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1854         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1855         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1856     }
1857 }
1858
#if 0
/**
 * Forward transforms the 16 luma dc values in place (disabled code).
 * @param qp quantization parameter ??? FIXME
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
    int temp[16]; //FIXME check if this is a good idea
    int n;

    /* first pass: butterflies over the input, results go to temp[] */
    for(n=0; n<4; n++){
        const DCTELEM *in = block + y_offset[n];
        int *out = temp + 4*n;
        const int sum04= in[stride*0] + in[stride*4];
        const int dif04= in[stride*0] - in[stride*4];
        const int dif15= in[stride*1] - in[stride*5];
        const int sum15= in[stride*1] + in[stride*5];

        out[0]= sum04+sum15;
        out[1]= dif04+dif15;
        out[2]= dif04-dif15;
        out[3]= sum04-sum15;
    }

    /* second pass: butterflies over temp[], halved on the way out */
    for(n=0; n<4; n++){
        DCTELEM *out = block + x_offset[n];
        const int sum02= temp[4*0+n] + temp[4*2+n];
        const int dif02= temp[4*0+n] - temp[4*2+n];
        const int dif13= temp[4*1+n] - temp[4*3+n];
        const int sum13= temp[4*1+n] + temp[4*3+n];

        out[stride*0 ]= (sum02 + sum13)>>1;
        out[stride*2 ]= (dif02 + dif13)>>1;
        out[stride*8 ]= (dif02 - dif13)>>1;
        out[stride*10]= (sum02 - sum13)>>1;
    }
}
#endif

/* the stride macro was only needed by the luma dc transforms above */
#undef xStride
#undef stride
1901
1902 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1903     const int stride= 16*2;
1904     const int xStride= 16;
1905     int a,b,c,d,e;
1906
1907     a= block[stride*0 + xStride*0];
1908     b= block[stride*0 + xStride*1];
1909     c= block[stride*1 + xStride*0];
1910     d= block[stride*1 + xStride*1];
1911
1912     e= a-b;
1913     a= a+b;
1914     b= c-d;
1915     c= c+d;
1916
1917     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1918     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1919     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1920     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1921 }
1922
#if 0
/**
 * Forward transforms the 2x2 chroma dc values in place (disabled code).
 * The four values sit at element offsets 0, 16, 32 and 48 of block.
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int row_step= 16*2;
    const int col_step= 16;
    DCTELEM * const top= block;
    DCTELEM * const bot= block + row_step;
    const int tl= top[0];
    const int tr= top[col_step];
    const int bl= bot[0];
    const int br= bot[col_step];
    const int top_sum = tl + tr;
    const int top_diff= tl - tr;
    const int bot_sum = bl + br;
    const int bot_diff= bl - br;

    top[0]       = top_sum  + bot_sum;
    top[col_step]= top_diff + bot_diff;
    bot[0]       = top_sum  - bot_sum;
    bot[col_step]= top_diff - bot_diff;
}
#endif
1945
1946 /**
1947  * gets the chroma qp.
1948  */
1949 static inline int get_chroma_qp(int chroma_qp_index_offset, int qscale){
1950
1951     return chroma_qp[av_clip(qscale + chroma_qp_index_offset, 0, 51)];
1952 }
1953
1954 //FIXME need to check that this doesnt overflow signed 32 bit for low qp, i am not sure, it's very close
1955 //FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
1956 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
1957     int i;
1958     const int * const quant_table= quant_coeff[qscale];
1959     const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1960     const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1961     const unsigned int threshold2= (threshold1<<1);
1962     int last_non_zero;
1963
1964     if(separate_dc){
1965         if(qscale<=18){
1966             //avoid overflows
1967             const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1968             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1969             const unsigned int dc_threshold2= (dc_threshold1<<1);
1970
1971             int level= block[0]*quant_coeff[qscale+18][0];
1972             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1973                 if(level>0){
1974                     level= (dc_bias + level)>>(QUANT_SHIFT-2);
1975                     block[0]= level;
1976                 }else{
1977                     level= (dc_bias - level)>>(QUANT_SHIFT-2);
1978                     block[0]= -level;
1979                 }
1980 //                last_non_zero = i;
1981             }else{
1982                 block[0]=0;
1983             }
1984         }else{
1985             const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1986             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1987             const unsigned int dc_threshold2= (dc_threshold1<<1);
1988
1989             int level= block[0]*quant_table[0];
1990             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1991                 if(level>0){
1992                     level= (dc_bias + level)>>(QUANT_SHIFT+1);
1993                     block[0]= level;
1994                 }else{
1995                     level= (dc_bias - level)>>(QUANT_SHIFT+1);
1996                     block[0]= -level;
1997                 }
1998 //                last_non_zero = i;
1999             }else{
2000                 block[0]=0;
2001             }
2002         }
2003         last_non_zero= 0;
2004         i=1;
2005     }else{
2006         last_non_zero= -1;
2007         i=0;
2008     }
2009
2010     for(; i<16; i++){
2011         const int j= scantable[i];
2012         int level= block[j]*quant_table[j];
2013
2014 //        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
2015 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
2016         if(((unsigned)(level+threshold1))>threshold2){
2017             if(level>0){
2018                 level= (bias + level)>>QUANT_SHIFT;
2019                 block[j]= level;
2020             }else{
2021                 level= (bias - level)>>QUANT_SHIFT;
2022                 block[j]= -level;
2023             }
2024             last_non_zero = i;
2025         }else{
2026             block[j]=0;
2027         }
2028     }
2029
2030     return last_non_zero;
2031 }
2032
/**
 * 4x4 vertical prediction: copies the four pixels above the block into
 * each of its four rows.
 * @param topright unused
 * @param stride distance in bytes between two rows
 */
static void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){
    const uint32_t top_row= *(uint32_t*)(src - stride);
    int y;

    for(y=0; y<4; y++)
        *(uint32_t*)(src + y*stride)= top_row;
}
2040
/**
 * 4x4 horizontal prediction: fills every row of the block with the pixel
 * immediately left of that row.
 * @param topright unused
 * @param stride distance in bytes between two rows
 */
static void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){
    /* the multiplication must be unsigned: a pixel value of 128 or more
     * times 0x01010101 does not fit into a signed 32 bit int */
    ((uint32_t*)(src+0*stride))[0]= src[-1+0*stride]*0x01010101U;
    ((uint32_t*)(src+1*stride))[0]= src[-1+1*stride]*0x01010101U;
    ((uint32_t*)(src+2*stride))[0]= src[-1+2*stride]*0x01010101U;
    ((uint32_t*)(src+3*stride))[0]= src[-1+3*stride]*0x01010101U;
}
2047
/**
 * 4x4 dc prediction: fills the block with the rounded average of the four
 * pixels above it and the four pixels left of it.
 * @param topright unused
 * @param stride distance in bytes between two rows
 */
static void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
                   + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;

    /* unsigned multiplication: dc can reach 255 and 255*0x01010101 does
     * not fit into a signed 32 bit int */
    ((uint32_t*)(src+0*stride))[0]=
    ((uint32_t*)(src+1*stride))[0]=
    ((uint32_t*)(src+2*stride))[0]=
    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101U;
}
2057
/**
 * 4x4 dc prediction from the left edge only: fills the block with the
 * rounded average of the four pixels left of it.
 * @param topright unused
 * @param stride distance in bytes between two rows
 */
static void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;

    /* unsigned multiplication: dc can reach 255 and 255*0x01010101 does
     * not fit into a signed 32 bit int */
    ((uint32_t*)(src+0*stride))[0]=
    ((uint32_t*)(src+1*stride))[0]=
    ((uint32_t*)(src+2*stride))[0]=
    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101U;
}
2066
/**
 * 4x4 dc prediction from the top edge only: fills the block with the
 * rounded average of the four pixels above it.
 * @param topright unused
 * @param stride distance in bytes between two rows
 */
static void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;

    /* unsigned multiplication: dc can reach 255 and 255*0x01010101 does
     * not fit into a signed 32 bit int */
    ((uint32_t*)(src+0*stride))[0]=
    ((uint32_t*)(src+1*stride))[0]=
    ((uint32_t*)(src+2*stride))[0]=
    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101U;
}
2075
/**
 * 4x4 dc prediction without any neighbours: fills the block with 128.
 * @param topright unused
 * @param stride distance in bytes between two rows
 */
static void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const uint32_t grey= 128U*0x01010101U;
    int y;

    for(y=0; y<4; y++)
        *(uint32_t*)(src + y*stride)= grey;
}
2082
2083
/* The LOAD_*_EDGE macros expand to declarations of the neighbouring pixels
 * of a 4x4 block; they expect src, stride and (for the top right edge)
 * topright to be in scope and are used without a trailing semicolon. */

/* t4..t7: the four pixels topright points at */
#define LOAD_TOP_RIGHT_EDGE\
    const int t4= topright[0], t5= topright[1],\
              t6= topright[2], t7= topright[3];

/* l0..l3: the pixels directly left of rows 0..3 */
#define LOAD_LEFT_EDGE\
    const int l0= src[-1+0*stride], l1= src[-1+1*stride],\
              l2= src[-1+2*stride], l3= src[-1+3*stride];

/* t0..t3: the pixels directly above columns 0..3 */
#define LOAD_TOP_EDGE\
    const int t0= src[ 0-1*stride], t1= src[ 1-1*stride],\
              t2= src[ 2-1*stride], t3= src[ 3-1*stride];
2101
2102 static void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){
2103     const int lt= src[-1-1*stride];
2104     LOAD_TOP_EDGE
2105     LOAD_LEFT_EDGE
2106
2107     src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2;
2108     src[0+2*stride]=
2109     src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
2110     src[0+1*stride]=
2111     src[1+2*stride]=
2112     src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
2113     src[0+0*stride]=
2114     src[1+1*stride]=
2115     src[2+2*stride]=
2116     src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
2117     src[1+0*stride]=
2118     src[2+1*stride]=
2119     src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
2120     src[2+0*stride]=
2121     src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
2122     src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
2123 }
2124
2125 static void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){
2126     LOAD_TOP_EDGE
2127     LOAD_TOP_RIGHT_EDGE
2128 //    LOAD_LEFT_EDGE
2129
2130     src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
2131     src[1+0*stride]=
2132     src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
2133     src[2+0*stride]=
2134     src[1+1*stride]=
2135     src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
2136     src[3+0*stride]=
2137     src[2+1*stride]=
2138     src[1+2*stride]=
2139     src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
2140     src[3+1*stride]=
2141     src[2+2*stride]=
2142     src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
2143     src[3+2*stride]=
2144     src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
2145     src[3+3*stride]=(t6 + 3*t7 + 2)>>2;
2146 }
2147
2148 static void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){
2149     const int lt= src[-1-1*stride];
2150     LOAD_TOP_EDGE
2151     LOAD_LEFT_EDGE
2152     const __attribute__((unused)) int unu= l3;
2153
2154     src[0+0*stride]=
2155     src[1+2*stride]=(lt + t0 + 1)>>1;
2156     src[1+0*stride]=
2157     src[2+2*stride]=(t0 + t1 + 1)>>1;
2158     src[2+0*stride]=
2159     src[3+2*stride]=(t1 + t2 + 1)>>1;
2160     src[3+0*stride]=(t2 + t3 + 1)>>1;
2161     src[0+1*stride]=
2162     src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
2163     src[1+1*stride]=
2164     src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
2165     src[2+1*stride]=
2166     src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
2167     src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
2168     src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
2169     src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
2170 }
2171
2172 static void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){
2173     LOAD_TOP_EDGE
2174     LOAD_TOP_RIGHT_EDGE
2175     const __attribute__((unused)) int unu= t7;
2176
2177     src[0+0*stride]=(t0 + t1 + 1)>>1;
2178     src[1+0*stride]=
2179     src[0+2*stride]=(t1 + t2 + 1)>>1;
2180     src[2+0*stride]=
2181     src[1+2*stride]=(t2 + t3 + 1)>>1;
2182     src[3+0*stride]=
2183     src[2+2*stride]=(t3 + t4+ 1)>>1;
2184     src[3+2*stride]=(t4 + t5+ 1)>>1;
2185     src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
2186     src[1+1*stride]=
2187     src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
2188     src[2+1*stride]=
2189     src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
2190     src[3+1*stride]=
2191     src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
2192     src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
2193 }
2194
2195 static void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){
2196     LOAD_LEFT_EDGE
2197
2198     src[0+0*stride]=(l0 + l1 + 1)>>1;
2199     src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
2200     src[2+0*stride]=
2201     src[0+1*stride]=(l1 + l2 + 1)>>1;
2202     src[3+0*stride]=
2203     src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
2204     src[2+1*stride]=
2205     src[0+2*stride]=(l2 + l3 + 1)>>1;
2206     src[3+1*stride]=
2207     src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2;
2208     src[3+2*stride]=
2209     src[1+3*stride]=
2210     src[0+3*stride]=
2211     src[2+2*stride]=
2212     src[2+3*stride]=
2213     src[3+3*stride]=l3;
2214 }
2215
2216 static void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){
2217     const int lt= src[-1-1*stride];
2218     LOAD_TOP_EDGE
2219     LOAD_LEFT_EDGE
2220     const __attribute__((unused)) int unu= t3;
2221
2222     src[0+0*stride]=
2223     src[2+1*stride]=(lt + l0 + 1)>>1;
2224     src[1+0*stride]=
2225     src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
2226     src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
2227     src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
2228     src[0+1*stride]=
2229     src[2+2*stride]=(l0 + l1 + 1)>>1;
2230     src[1+1*stride]=
2231     src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
2232     src[0+2*stride]=
2233     src[2+3*stride]=(l1 + l2+ 1)>>1;
2234     src[1+2*stride]=
2235     src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
2236     src[0+3*stride]=(l2 + l3 + 1)>>1;
2237     src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
2238 }
2239
/**
 * 16x16 intra prediction, vertical mode: copies the 16 bytes of the row
 * directly above the block into each of the block's 16 rows.
 * Rows are read and written as four 32-bit words.
 * @param src    top-left sample of the block; the row at src-stride is read
 * @param stride distance in bytes between vertically adjacent samples
 */
void ff_pred16x16_vertical_c(uint8_t *src, int stride){
    const uint32_t *above= (const uint32_t*)(src-stride);
    const uint32_t w0= above[0];
    const uint32_t w1= above[1];
    const uint32_t w2= above[2];
    const uint32_t w3= above[3];
    int row;

    for(row=0; row<16; row++){
        uint32_t *line= (uint32_t*)(src+row*stride);
        line[0]= w0;
        line[1]= w1;
        line[2]= w2;
        line[3]= w3;
    }
}
2254
/**
 * 16x16 intra prediction, horizontal mode: every row of the block is filled
 * with the sample immediately to its left (src[-1] of that row).
 * Rows are written as four 32-bit words.
 * @param src    top-left sample of the block
 * @param stride distance in bytes between vertically adjacent samples
 */
void ff_pred16x16_horizontal_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<16; i++){
        /* The multiply replicates the byte into all four lanes.  It must be
         * unsigned: as a signed int product it overflows for samples >= 128. */
        const uint32_t splat= src[-1+i*stride]*0x01010101U;

        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= splat;
    }
}
2265
/**
 * 16x16 intra prediction, DC mode: fills the block with the rounded mean of
 * the 16 samples left of the block and the 16 samples above it.
 * Rows are written as four 32-bit words.
 * @param src    top-left sample of the block
 * @param stride distance in bytes between vertically adjacent samples
 */
void ff_pred16x16_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t splat;

    for(i=0;i<16; i++){
        sum+= src[-1+i*stride];
    }

    for(i=0;i<16; i++){
        sum+= src[i-stride];
    }

    /* Replicate the mean into four byte lanes.  Unsigned arithmetic is
     * required: the signed product overflows for a mean >= 128. */
    splat= 0x01010101U*((sum + 16)>>5);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= splat;
    }
}
2286
/**
 * 16x16 intra prediction, DC mode using only the left neighbours: fills the
 * block with the rounded mean of the 16 samples left of the block.
 * Rows are written as four 32-bit words.
 * @param src    top-left sample of the block
 * @param stride distance in bytes between vertically adjacent samples
 */
static void pred16x16_left_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t splat;

    for(i=0;i<16; i++){
        sum+= src[-1+i*stride];
    }

    /* unsigned multiply: the signed product overflows for a mean >= 128 */
    splat= 0x01010101U*((sum + 8)>>4);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= splat;
    }
}
2303
/**
 * 16x16 intra prediction, DC mode using only the top neighbours: fills the
 * block with the rounded mean of the 16 samples in the row above the block.
 * Rows are written as four 32-bit words.
 * @param src    top-left sample of the block
 * @param stride distance in bytes between vertically adjacent samples
 */
static void pred16x16_top_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t splat;

    for(i=0;i<16; i++){
        sum+= src[i-stride];
    }
    /* unsigned multiply: the signed product overflows for a mean >= 128 */
    splat= 0x01010101U*((sum + 8)>>4);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= splat;
    }
}
2319
/**
 * 16x16 intra prediction, constant DC: fills the whole block with the value
 * 128 without reading any neighbouring sample.
 * Rows are written as four 32-bit words.
 * @param src    top-left sample of the block
 * @param stride distance in bytes between vertically adjacent samples
 */
void ff_pred16x16_128_dc_c(uint8_t *src, int stride){
    const uint32_t grey= 0x01010101U*128U;
    int row, word;

    for(row=0; row<16; row++){
        uint32_t *line= (uint32_t*)(src+row*stride);
        for(word=0; word<4; word++)
            line[word]= grey;
    }
}
2330
/**
 * 16x16 intra prediction, plane mode, shared by the normal and the "svq3"
 * variant.
 * H and V are weighted sums (weights 1..8) of differences between samples
 * placed symmetrically around the middle of the row above the block (H) and
 * of the column left of the block (V).  After scaling, the block is filled
 * with cm[(a + x*H + y*V) >> 5] style values, computed incrementally.
 * cm points into ff_cropTbl offset by MAX_NEG_CROP (both defined elsewhere;
 * presumably a clamp-to-0..255 lookup table -- TODO confirm).
 * @param src    top-left sample of the block
 * @param stride distance in bytes between vertically adjacent samples
 * @param svq3   non-zero selects the alternative scaling of H and V and
 *               exchanges them afterwards
 */
2331 static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3){
2332   int i, j, k;
2333   int a;
2334   uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* src0: sample 7 of the row above; src0[k] and src0[-k] are read for k=1..8 */
2335   const uint8_t * const src0 = src+7-stride;
/* src1/src2: left-column samples of rows 8 and 6; they move apart in the loop */
2336   const uint8_t *src1 = src+8*stride-1;
2337   const uint8_t *src2 = src1-2*stride;      // == src+6*stride-1;
2338   int H = src0[1] - src0[-1];
2339   int V = src1[0] - src2[ 0];
2340   for(k=2; k<=8; ++k) {
2341     src1 += stride; src2 -= stride;
2342     H += k*(src0[k] - src0[-k]);
2343     V += k*(src1[0] - src2[ 0]);
2344   }
2345   if(svq3){
/* truncating divisions here, instead of the rounded shift of the else branch */
2346     H = ( 5*(H/4) ) / 16;
2347     V = ( 5*(V/4) ) / 16;
2348
2349     /* required for 100% accuracy */
2350     i = H; H = V; V = i;
2351   }else{
2352     H = ( 5*H+32 ) >> 6;
2353     V = ( 5*V+32 ) >> 6;
2354   }
2355
/* After the loop src1 is the left sample of row 15 and src2 the sample above-left
 * of the block, so src2[16] is sample 15 of the row above. */
2356   a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
2357   for(j=16; j>0; --j) {
/* b walks along the row in steps of H; a advances by V per row */
2358     int b = a;
2359     a += V;
2360     for(i=-16; i<0; i+=4) {
2361       src[16+i] = cm[ (b    ) >> 5 ];
2362       src[17+i] = cm[ (b+  H) >> 5 ];
2363       src[18+i] = cm[ (b+2*H) >> 5 ];
2364       src[19+i] = cm[ (b+3*H) >> 5 ];
2365       b += 4*H;
2366     }
2367     src += stride;
2368   }
2369 }
2370
/**
 * 16x16 intra prediction, plane mode.
 * Thin wrapper that runs pred16x16_plane_compat_c() with its svq3 argument
 * set to 0.
 * @param src    top-left sample of the block
 * @param stride distance in bytes between vertically adjacent samples
 */
void ff_pred16x16_plane_c(uint8_t *src, int stride){
    const int svq3_variant= 0;

    pred16x16_plane_compat_c(src, stride, svq3_variant);
}
2374
/**
 * 8x8 intra prediction, vertical mode: copies the 8 bytes of the row
 * directly above the block into each of the block's 8 rows.
 * Rows are read and written as two 32-bit words.
 * @param src    top-left sample of the block; the row at src-stride is read
 * @param stride distance in bytes between vertically adjacent samples
 */
void ff_pred8x8_vertical_c(uint8_t *src, int stride){
    const uint32_t *above= (const uint32_t*)(src-stride);
    const uint32_t w0= above[0];
    const uint32_t w1= above[1];
    int row;

    for(row=0; row<8; row++){
        uint32_t *line= (uint32_t*)(src+row*stride);
        line[0]= w0;
        line[1]= w1;
    }
}
2385
/**
 * 8x8 intra prediction, horizontal mode: every row of the block is filled
 * with the sample immediately to its left (src[-1] of that row).
 * Rows are written as two 32-bit words.
 * @param src    top-left sample of the block
 * @param stride distance in bytes between vertically adjacent samples
 */
void ff_pred8x8_horizontal_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<8; i++){
        /* unsigned multiply: the signed product overflows for samples >= 128 */
        const uint32_t splat= src[-1+i*stride]*0x01010101U;

        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= splat;
    }
}
2394
/**
 * 8x8 intra prediction, constant DC: fills the whole block with the value
 * 128 without reading any neighbouring sample.
 * Rows are written as two 32-bit words.
 * @param src    top-left sample of the block
 * @param stride distance in bytes between vertically adjacent samples
 */
void ff_pred8x8_128_dc_c(uint8_t *src, int stride){
    const uint32_t grey= 0x01010101U*128U;
    int row;

    for(row=0; row<8; row++){
        uint32_t *line= (uint32_t*)(src+row*stride);
        line[0]= grey;
        line[1]= grey;
    }
}
2403
/**
 * 8x8 intra prediction, DC mode using only the left neighbours.
 * Rows 0..3 are filled with the rounded mean of the four left samples of
 * rows 0..3, rows 4..7 with the rounded mean of the left samples of rows
 * 4..7.  Rows are written as two 32-bit words.
 * @param src    top-left sample of the block
 * @param stride distance in bytes between vertically adjacent samples
 */
static void pred8x8_left_dc_c(uint8_t *src, int stride){
    int i;
    int sum_upper=0, sum_lower=0;
    uint32_t dc0, dc2;

    for(i=0;i<4; i++){
        sum_upper+= src[-1+i*stride];
        sum_lower+= src[-1+(i+4)*stride];
    }
    /* unsigned multiply: the signed product overflows for a mean >= 128 */
    dc0= 0x01010101U*((sum_upper + 2)>>2);
    dc2= 0x01010101U*((sum_lower + 2)>>2);

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= dc0;
    }
    for(i=4; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= dc2;
    }
}
2425
/**
 * 8x8 intra prediction, DC mode using only the top neighbours.
 * Columns 0..3 of every row are filled with the rounded mean of the four
 * samples above columns 0..3, columns 4..7 with the rounded mean of the four
 * samples above columns 4..7.  Rows are written as two 32-bit words.
 * @param src    top-left sample of the block
 * @param stride distance in bytes between vertically adjacent samples
 */
static void pred8x8_top_dc_c(uint8_t *src, int stride){
    int i;
    int sum_left=0, sum_right=0;
    uint32_t dc0, dc1;

    for(i=0;i<4; i++){
        sum_left += src[i-stride];
        sum_right+= src[4+i-stride];
    }
    /* unsigned multiply: the signed product overflows for a mean >= 128 */
    dc0= 0x01010101U*((sum_left  + 2)>>2);
    dc1= 0x01010101U*((sum_right + 2)>>2);

    /* all eight rows receive the same pair of words */
    for(i=0; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= dc0;
        ((uint32_t*)(src+i*stride))[1]= dc1;
    }
}
2447
2448
/**
 * 8x8 intra prediction, DC mode, computed separately for the four 4x4
 * quadrants:
 *  - top-left:     mean of the 4 left samples of rows 0..3 and the 4 top
 *                  samples of columns 0..3
 *  - top-right:    mean of the 4 top samples of columns 4..7
 *  - bottom-left:  mean of the 4 left samples of rows 4..7
 *  - bottom-right: mean of the 8 samples used by the two previous quadrants
 * Rows are written as two 32-bit words.
 * @param src    top-left sample of the block
 * @param stride distance in bytes between vertically adjacent samples
 */
void ff_pred8x8_dc_c(uint8_t *src, int stride){
    int i;
    int sum0=0, sum1=0, sum2=0;
    uint32_t dc0, dc1, dc2, dc3;

    for(i=0;i<4; i++){
        sum0+= src[-1+i*stride] + src[i-stride];
        sum1+= src[4+i-stride];
        sum2+= src[-1+(i+4)*stride];
    }
    /* unsigned multiplies: the signed products overflow for means >= 128 */
    dc3= 0x01010101U*((sum1 + sum2 + 4)>>3);
    dc0= 0x01010101U*((sum0 + 4)>>3);
    dc1= 0x01010101U*((sum1 + 2)>>2);
    dc2= 0x01010101U*((sum2 + 2)>>2);

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]= dc0;
        ((uint32_t*)(src+i*stride))[1]= dc1;
    }
    for(i=4; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= dc2;
        ((uint32_t*)(src+i*stride))[1]= dc3;
    }
}
2473
/**
 * 8x8 intra prediction, plane mode.
 * H and V are weighted sums (weights 1..4) of differences between samples
 * placed symmetrically around the middle of the row above the block (H) and
 * of the column left of the block (V), each scaled with (17*x+16)>>5.  The
 * block is then filled with cm[...>>5] of a value that grows by H per
 * column and by V per row.
 * cm points into ff_cropTbl offset by MAX_NEG_CROP (both defined elsewhere;
 * presumably a clamp-to-0..255 lookup table -- TODO confirm).
 * @param src    top-left sample of the block
 * @param stride distance in bytes between vertically adjacent samples
 */
2474 void ff_pred8x8_plane_c(uint8_t *src, int stride){
2475   int j, k;
2476   int a;
2477   uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* src0: sample 3 of the row above; src0[k] and src0[-k] are read for k=1..4 */
2478   const uint8_t * const src0 = src+3-stride;
/* src1/src2: left-column samples of rows 4 and 2; they move apart in the loop */
2479   const uint8_t *src1 = src+4*stride-1;
2480   const uint8_t *src2 = src1-2*stride;      // == src+2*stride-1;
2481   int H = src0[1] - src0[-1];
2482   int V = src1[0] - src2[ 0];
2483   for(k=2; k<=4; ++k) {
2484     src1 += stride; src2 -= stride;
2485     H += k*(src0[k] - src0[-k]);
2486     V += k*(src1[0] - src2[ 0]);
2487   }
2488   H = ( 17*H+16 ) >> 5;
2489   V = ( 17*V+16 ) >> 5;
2490
/* After the loop src1 is the left sample of row 7 and src2 the sample above-left
 * of the block, so src2[8] is sample 7 of the row above. */
2491   a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
2492   for(j=8; j>0; --j) {
2493     int b = a;
2494     a += V;
2495     src[0] = cm[ (b    ) >> 5 ];
2496     src[1] = cm[ (b+  H) >> 5 ];
2497     src[2] = cm[ (b+2*H) >> 5 ];
2498     src[3] = cm[ (b+3*H) >> 5 ];
2499     src[4] = cm[ (b+4*H) >> 5 ];
2500     src[5] = cm[ (b+5*H) >> 5 ];
2501     src[6] = cm[ (b+6*H) >> 5 ];
2502     src[7] = cm[ (b+7*H) >> 5 ];
2503     src += stride;
2504   }
2505 }
2506
/* Helper macros for the pred8x8l_* functions below.  They all rely on the
 * enclosing function providing `src`, `stride`, `has_topleft` and
 * `has_topright`. */

/* SRC(x,y): sample at column x, row y relative to src (negative indices
 * address the neighbours above / left of the block). */
2507 #define SRC(x,y) src[(x)+(y)*stride]
/* PL(y): declares l<y>, the (1,2,1)-filtered left neighbour of row y. */
2508 #define PL(y) \
2509     const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
/* Declares l0..l7.  l0 uses SRC(-1,0) in place of the above-left sample when
 * has_topleft is 0; l7 is filtered with weights (1,3) from rows 6 and 7.
 * The expansion does not end in ';' -- the user supplies it. */
2510 #define PREDICT_8x8_LOAD_LEFT \
2511     const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
2512                      + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
2513     PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
2514     const int l7 attribute_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2
2515
/* PT(x): declares t<x>, the (1,2,1)-filtered top neighbour of column x. */
2516 #define PT(x) \
2517     const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
/* Declares t0..t7.  t0 uses SRC(0,-1) in place of the above-left sample when
 * has_topleft is 0; t7 uses SRC(7,-1) in place of SRC(8,-1) when
 * has_topright is 0.  The expansion does not end in ';'. */
2518 #define PREDICT_8x8_LOAD_TOP \
2519     const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
2520                      + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
2521     PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
2522     const int t7 attribute_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
2523                      + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2
2524
/* PTR(x): like PT(x) but assigns to an already declared t<x>. */
2525 #define PTR(x) \
2526     t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
/* Declares t8..t15.  With has_topright they are filtered from the row above
 * (t15 with weights (1,3)); otherwise all are set to the unfiltered
 * SRC(7,-1). */
2527 #define PREDICT_8x8_LOAD_TOPRIGHT \
2528     int t8, t9, t10, t11, t12, t13, t14, t15; \
2529     if(has_topright) { \
2530         PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
2531         t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
2532     } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);
2533
/* Declares lt, the (1,2,1)-filtered above-left sample.  It reads SRC(-1,-1)
 * regardless of has_topleft. */
2534 #define PREDICT_8x8_LOAD_TOPLEFT \
2535     const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2
2536
/* PREDICT_8x8_DC(v): declares y and fills 8 rows with two 32-bit words of
 * value v each, advancing src by stride per row. */
2537 #define PREDICT_8x8_DC(v) \
2538     int y; \
2539     for( y = 0; y < 8; y++ ) { \
2540         ((uint32_t*)src)[0] = \
2541         ((uint32_t*)src)[1] = v; \
2542         src += stride; \
2543     }
2544
/**
 * 8x8 "l" intra prediction, constant DC: fills the block with the byte value
 * 0x80 (128) without reading any neighbour.  has_topleft and has_topright
 * are not used.
 */
static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    const uint32_t mid_grey = 0x80808080;

    PREDICT_8x8_DC(mid_grey);
}
2549 static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2550 {
2551     PREDICT_8x8_LOAD_LEFT;
2552     const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101;
2553     PREDICT_8x8_DC(dc);
2554 }
2555 static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2556 {
2557     PREDICT_8x8_LOAD_TOP;
2558     const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101;
2559     PREDICT_8x8_DC(dc);
2560 }
2561 static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2562 {
2563     PREDICT_8x8_LOAD_LEFT;
2564     PREDICT_8x8_LOAD_TOP;
2565     const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7
2566                          +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101;
2567     PREDICT_8x8_DC(dc);
2568 }
2569 static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2570 {
2571     PREDICT_8x8_LOAD_LEFT;
2572 #define ROW(y) ((uint32_t*)(src+y*stride))[0] =\
2573                ((uint32_t*)(src+y*stride))[1] = 0x01010101 * l##y
2574     ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
2575 #undef ROW
2576 }
2577 static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2578 {
2579     int y;
2580     PREDICT_8x8_LOAD_TOP;
2581     src[0] = t0;
2582     src[1] = t1;
2583     src[2] = t2;
2584     src[3] = t3;
2585     src[4] = t4;
2586     src[5] = t5;
2587     src[6] = t6;
2588     src[7] = t7;
2589     for( y = 1; y < 8; y++ )
2590         *(uint64_t*)(src+y*stride) = *(uint64_t*)src;
2591 }
/**
 * 8x8 "l" intra prediction, diagonal down-left mode.
 * All samples with the same x+y receive the same value: a rounded (1,2,1)
 * filter over three consecutive entries of the filtered top / top-right row
 * t0..t15, starting at t[x+y].  The last sample (7,7) uses weights (1,3) on
 * t14 and t15.
 */
2592 static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2593 {
2594     PREDICT_8x8_LOAD_TOP;
2595     PREDICT_8x8_LOAD_TOPRIGHT;
2596     SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
2597     SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
2598     SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
2599     SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
2600     SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
2601     SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
2602     SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
2603     SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
2604     SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
2605     SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
2606     SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
2607     SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
2608     SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
2609     SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
2610     SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
2611 }
/**
 * 8x8 "l" intra prediction, diagonal down-right mode.
 * All samples with the same x-y receive the same value: a rounded (1,2,1)
 * filter over three consecutive entries of the sequence
 * l7..l0, lt, t0..t7 (filtered left column, above-left sample, filtered top
 * row).  The statements run from the bottom-left corner (0,7) to the
 * top-right corner (7,0).
 */
2612 static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2613 {
2614     PREDICT_8x8_LOAD_TOP;
2615     PREDICT_8x8_LOAD_LEFT;
2616     PREDICT_8x8_LOAD_TOPLEFT;
2617     SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
2618     SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
2619     SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
2620     SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
2621     SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
2622     SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
2623     SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
2624     SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
2625     SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
2626     SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
2627     SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
2628     SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
2629     SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
2630     SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
2631     SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
2632
2633 }
/**
 * 8x8 "l" intra prediction, vertical-right mode.
 * All samples with the same 2*x-y receive the same value.  Values taken from
 * the top row alternate between a rounded 2-tap average (even rows) and a
 * rounded (1,2,1) filter (odd rows); the values reaching into the left
 * column (l0..l6) and lt use the (1,2,1) filter.  l7 and has_topright-only
 * samples are not used by the assignments.
 */
2634 static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2635 {
2636     PREDICT_8x8_LOAD_TOP;
2637     PREDICT_8x8_LOAD_LEFT;
2638     PREDICT_8x8_LOAD_TOPLEFT;
2639     SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
2640     SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
2641     SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
2642     SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
2643     SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
2644     SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
2645     SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
2646     SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
2647     SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
2648     SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
2649     SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
2650     SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
2651     SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
2652     SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
2653     SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
2654     SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
2655     SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
2656     SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
2657     SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
2658     SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
2659     SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
2660     SRC(7,0)= (t6 + t7 + 1) >> 1;
2661 }
/**
 * 8x8 "l" intra prediction, horizontal-down mode.
 * All samples with the same 2*y-x receive the same value.  Values taken from
 * the left column alternate between a rounded 2-tap average (even columns)
 * and a rounded (1,2,1) filter (odd columns); the values reaching into the
 * top row (t0..t6) and lt use the (1,2,1) filter.  t7 is not used by the
 * assignments.
 */
2662 static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2663 {
2664     PREDICT_8x8_LOAD_TOP;
2665     PREDICT_8x8_LOAD_LEFT;
2666     PREDICT_8x8_LOAD_TOPLEFT;
2667     SRC(0,7)= (l6 + l7 + 1) >> 1;
2668     SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
2669     SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
2670     SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
2671     SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
2672     SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
2673     SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
2674     SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
2675     SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
2676     SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
2677     SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
2678     SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
2679     SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
2680     SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
2681     SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
2682     SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
2683     SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
2684     SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
2685     SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
2686     SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
2687     SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
2688     SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
2689 }
/**
 * 8x8 "l" intra prediction, vertical-left mode.
 * All samples with the same 2*x+y receive the same value, derived from the
 * filtered top / top-right row: even rows use a rounded 2-tap average of two
 * adjacent entries, odd rows a rounded (1,2,1) filter over three.  Only
 * t0..t12 are read by the assignments.
 */
2690 static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2691 {
2692     PREDICT_8x8_LOAD_TOP;
2693     PREDICT_8x8_LOAD_TOPRIGHT;
2694     SRC(0,0)= (t0 + t1 + 1) >> 1;
2695     SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
2696     SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
2697     SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
2698     SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
2699     SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
2700     SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
2701     SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
2702     SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
2703     SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
2704     SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
2705     SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
2706     SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
2707     SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
2708     SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
2709     SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
2710     SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
2711     SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
2712     SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
2713     SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
2714     SRC(7,6)= (t10 + t11 + 1) >> 1;
2715     SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
2716 }
/**
 * 8x8 "l" intra prediction, horizontal-up mode.
 * Uses only the filtered left column l0..l7.  All samples with the same
 * x+2*y receive the same value: even x+2*y use a rounded 2-tap average of two
 * adjacent entries, odd x+2*y a rounded (1,2,1) filter over three.  For
 * x+2*y == 13 the filter degenerates to weights (1,3) on l6 and l7, and every
 * sample with x+2*y >= 14 is set to l7.
 */
2717 static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2718 {
2719     PREDICT_8x8_LOAD_LEFT;
2720     SRC(0,0)= (l0 + l1 + 1) >> 1;
2721     SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
2722     SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
2723     SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
2724     SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
2725     SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
2726     SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
2727     SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
2728     SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
2729     SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
2730     SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
2731     SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
2732     SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
2733     SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
2734     SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
2735     SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
2736     SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
2737     SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
2738 }
/* The helper macros are only needed by the pred8x8l_* functions above;
 * undefine them so the short names (SRC, PL, PT, PTR) do not leak into the
 * rest of the file. */
2739 #undef PREDICT_8x8_LOAD_LEFT
2740 #undef PREDICT_8x8_LOAD_TOP
2741 #undef PREDICT_8x8_LOAD_TOPLEFT
2742 #undef PREDICT_8x8_LOAD_TOPRIGHT
2743 #undef PREDICT_8x8_DC
2744 #undef PTR
2745 #undef PT
2746 #undef PL
2747 #undef SRC
2748
2749 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
2750                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2751                            int src_x_offset, int src_y_offset,
2752                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
2753     MpegEncContext * const s = &h->s;
2754     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
2755     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
2756     const int luma_xy= (mx&3) + ((my&3)<<2);
2757     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
2758     uint8_t * src_cb, * src_cr;
2759     int extra_width= h->emu_edge_width;
2760     int extra_height= h->emu_edge_height;
2761     int emu=0;
2762     const int full_mx= mx>>2;
2763     const int full_my= my>>2;
2764     const int pic_width  = 16*s->mb_width;
2765     const int pic_height = 16*s->mb_height >> MB_MBAFF;
2766
2767     if(!pic->data[0]) //FIXME this is unacceptable, some senseable error concealment must be done for missing reference frames
2768         return;
2769
2770     if(mx&7) extra_width -= 3;
2771     if(my&7) extra_height -= 3;
2772
2773     if(   full_mx < 0-extra_width
2774        || full_my < 0-extra_height
2775        || full_mx + 16/*FIXME*/ > pic_width + extra_width
2776        || full_my + 16/*FIXME*/ > pic_height + extra_height){
2777         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
2778             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
2779         emu=1;
2780     }
2781
2782     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
2783     if(!square){
2784         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
2785     }
2786
2787     if(s->flags&CODEC_FLAG_GRAY) return;
2788
2789     if(MB_MBAFF){
2790         // chroma offset when predicting from a field of opposite parity
2791         my += 2 * ((s->mb_y & 1) - (h->ref_cache[list][scan8[n]] & 1));
2792         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
2793     }
2794     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
2795     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
2796
2797     if(emu){
2798         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
2799             src_cb= s->edge_emu_buffer;
2800     }
2801     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
2802
2803     if(emu){
2804         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
2805             src_cr= s->edge_emu_buffer;
2806     }
2807     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
2808 }
2809
2810 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
2811                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2812                            int x_offset, int y_offset,
2813                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2814                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
2815                            int list0, int list1){
2816     MpegEncContext * const s = &h->s;
2817     qpel_mc_func *qpix_op=  qpix_put;
2818     h264_chroma_mc_func chroma_op= chroma_put;
2819
2820     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
2821     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
2822     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
2823     x_offset += 8*s->mb_x;
2824     y_offset += 8*(s->mb_y >> MB_MBAFF);
2825
2826     if(list0){
2827         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
2828         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
2829                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
2830                            qpix_op, chroma_op);
2831
2832         qpix_op=  qpix_avg;
2833         chroma_op= chroma_avg;
2834     }
2835
2836     if(list1){
2837         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
2838         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
2839                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
2840                            qpix_op, chroma_op);
2841     }
2842 }
2843
2844 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
2845                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2846                            int x_offset, int y_offset,
2847                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2848                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
2849                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
2850                            int list0, int list1){
2851     MpegEncContext * const s = &h->s;
2852
2853     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
2854     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
2855     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
2856     x_offset += 8*s->mb_x;
2857     y_offset += 8*(s->mb_y >> MB_MBAFF);
2858
2859     if(list0 && list1){
2860         /* don't optimize for luma-only case, since B-frames usually
2861          * use implicit weights => chroma too. */
2862         uint8_t *tmp_cb = s->obmc_scratchpad;
2863         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
2864         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
2865         int refn0 = h->ref_cache[0][ scan8[n] ];
2866         int refn1 = h->ref_cache[1][ scan8[n] ];
2867
2868         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
2869                     dest_y, dest_cb, dest_cr,
2870                     x_offset, y_offset, qpix_put, chroma_put);
2871         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
2872                     tmp_y, tmp_cb, tmp_cr,
2873                     x_offset, y_offset, qpix_put, chroma_put);
2874
2875         if(h->use_weight == 2){
2876             int weight0 = h->implicit_weight[refn0][refn1];
2877             int weight1 = 64 - weight0;
2878             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
2879             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
2880             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
2881         }else{
2882             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
2883                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
2884                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
2885             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
2886                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
2887                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
2888             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
2889                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
2890                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
2891         }
2892     }else{
2893         int list = list1 ? 1 : 0;
2894         int refn = h->ref_cache[list][ scan8[n] ];
2895         Picture *ref= &h->ref_list[list][refn];
2896         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
2897                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
2898                     qpix_put, chroma_put);
2899
2900         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
2901                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
2902         if(h->use_weight_chroma){
2903             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
2904                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
2905             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
2906                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
2907         }
2908     }
2909 }
2910
2911 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
2912                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2913                            int x_offset, int y_offset,
2914                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2915                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
2916                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
2917                            int list0, int list1){
2918     if((h->use_weight==2 && list0 && list1
2919         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
2920        || h->use_weight==1)
2921         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
2922                          x_offset, y_offset, qpix_put, chroma_put,
2923                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
2924     else
2925         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
2926                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
2927 }
2928
2929 static inline void prefetch_motion(H264Context *h, int list){
2930     /* fetch pixels for estimated mv 4 macroblocks ahead
2931      * optimized for 64byte cache lines */
2932     MpegEncContext * const s = &h->s;
2933     const int refn = h->ref_cache[list][scan8[0]];
2934     if(refn >= 0){
2935         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
2936         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
2937         uint8_t **src= h->ref_list[list][refn].data;
2938         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
2939         s->dsp.prefetch(src[0]+off, s->linesize, 4);
2940         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
2941         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
2942     }
2943 }
2944
2945 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2946                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
2947                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
2948                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
2949     MpegEncContext * const s = &h->s;
2950     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
2951     const int mb_type= s->current_picture.mb_type[mb_xy];
2952
2953     assert(IS_INTER(mb_type));
2954
2955     prefetch_motion(h, 0);
2956
2957     if(IS_16X16(mb_type)){
2958         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
2959                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
2960                 &weight_op[0], &weight_avg[0],
2961                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2962     }else if(IS_16X8(mb_type)){
2963         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
2964                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
2965                 &weight_op[1], &weight_avg[1],
2966                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2967         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
2968                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
2969                 &weight_op[1], &weight_avg[1],
2970                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
2971     }else if(IS_8X16(mb_type)){
2972         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
2973                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2974                 &weight_op[2], &weight_avg[2],
2975                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2976         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
2977                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2978                 &weight_op[2], &weight_avg[2],
2979                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
2980     }else{
2981         int i;
2982
2983         assert(IS_8X8(mb_type));
2984
2985         for(i=0; i<4; i++){
2986             const int sub_mb_type= h->sub_mb_type[i];
2987             const int n= 4*i;
2988             int x_offset= (i&1)<<2;
2989             int y_offset= (i&2)<<1;
2990
2991             if(IS_SUB_8X8(sub_mb_type)){
2992                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
2993                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2994                     &weight_op[3], &weight_avg[3],
2995                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2996             }else if(IS_SUB_8X4(sub_mb_type)){
2997                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
2998                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
2999                     &weight_op[4], &weight_avg[4],
3000                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3001                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
3002                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
3003                     &weight_op[4], &weight_avg[4],
3004                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3005             }else if(IS_SUB_4X8(sub_mb_type)){
3006                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
3007                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
3008                     &weight_op[5], &weight_avg[5],
3009                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3010                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
3011                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
3012                     &weight_op[5], &weight_avg[5],
3013                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3014             }else{
3015                 int j;
3016                 assert(IS_SUB_4X4(sub_mb_type));
3017                 for(j=0; j<4; j++){
3018                     int sub_x_offset= x_offset + 2*(j&1);
3019                     int sub_y_offset= y_offset +   (j&2);
3020                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
3021                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
3022                         &weight_op[6], &weight_avg[6],
3023                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3024                 }
3025             }
3026         }
3027     }
3028
3029     prefetch_motion(h, 1);
3030 }
3031
3032 static void decode_init_vlc(void){
3033     static int done = 0;
3034
3035     if (!done) {
3036         int i;
3037         done = 1;
3038
3039         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
3040                  &chroma_dc_coeff_token_len [0], 1, 1,
3041                  &chroma_dc_coeff_token_bits[0], 1, 1, 1);
3042
3043         for(i=0; i<4; i++){
3044             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
3045                      &coeff_token_len [i][0], 1, 1,
3046                      &coeff_token_bits[i][0], 1, 1, 1);
3047         }
3048
3049         for(i=0; i<3; i++){
3050             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
3051                      &chroma_dc_total_zeros_len [i][0], 1, 1,
3052                      &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
3053         }
3054         for(i=0; i<15; i++){
3055             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
3056                      &total_zeros_len [i][0], 1, 1,
3057                      &total_zeros_bits[i][0], 1, 1, 1);
3058         }
3059
3060         for(i=0; i<6; i++){
3061             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
3062                      &run_len [i][0], 1, 1,
3063                      &run_bits[i][0], 1, 1, 1);
3064         }
3065         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
3066                  &run_len [6][0], 1, 1,
3067                  &run_bits[6][0], 1, 1, 1);
3068     }
3069 }
3070
3071 /**
3072  * Sets the intra prediction function pointers.
3073  */
3074 static void init_pred_ptrs(H264Context *h){
3075 //    MpegEncContext * const s = &h->s;
3076
3077     h->pred4x4[VERT_PRED           ]= pred4x4_vertical_c;
3078     h->pred4x4[HOR_PRED            ]= pred4x4_horizontal_c;
3079     h->pred4x4[DC_PRED             ]= pred4x4_dc_c;
3080     h->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_c;
3081     h->pred4x4[DIAG_DOWN_RIGHT_PRED]= pred4x4_down_right_c;
3082     h->pred4x4[VERT_RIGHT_PRED     ]= pred4x4_vertical_right_c;
3083     h->pred4x4[HOR_DOWN_PRED       ]= pred4x4_horizontal_down_c;
3084     h->pred4x4[VERT_LEFT_PRED      ]= pred4x4_vertical_left_c;
3085     h->pred4x4[HOR_UP_PRED         ]= pred4x4_horizontal_up_c;
3086     h->pred4x4[LEFT_DC_PRED        ]= pred4x4_left_dc_c;
3087     h->pred4x4[TOP_DC_PRED         ]= pred4x4_top_dc_c;
3088     h->pred4x4[DC_128_PRED         ]= pred4x4_128_dc_c;
3089
3090     h->pred8x8l[VERT_PRED           ]= pred8x8l_vertical_c;
3091     h->pred8x8l[HOR_PRED            ]= pred8x8l_horizontal_c;
3092     h->pred8x8l[DC_PRED             ]= pred8x8l_dc_c;
3093     h->pred8x8l[DIAG_DOWN_LEFT_PRED ]= pred8x8l_down_left_c;
3094     h->pred8x8l[DIAG_DOWN_RIGHT_PRED]= pred8x8l_down_right_c;
3095     h->pred8x8l[VERT_RIGHT_PRED     ]= pred8x8l_vertical_right_c;
3096     h->pred8x8l[HOR_DOWN_PRED       ]= pred8x8l_horizontal_down_c;
3097     h->pred8x8l[VERT_LEFT_PRED      ]= pred8x8l_vertical_left_c;
3098     h->pred8x8l[HOR_UP_PRED         ]= pred8x8l_horizontal_up_c;
3099     h->pred8x8l[LEFT_DC_PRED        ]= pred8x8l_left_dc_c;
3100     h->pred8x8l[TOP_DC_PRED         ]= pred8x8l_top_dc_c;
3101     h->pred8x8l[DC_128_PRED         ]= pred8x8l_128_dc_c;
3102
3103     h->pred8x8[DC_PRED8x8     ]= ff_pred8x8_dc_c;
3104     h->pred8x8[VERT_PRED8x8   ]= ff_pred8x8_vertical_c;
3105     h->pred8x8[HOR_PRED8x8    ]= ff_pred8x8_horizontal_c;
3106     h->pred8x8[PLANE_PRED8x8  ]= ff_pred8x8_plane_c;
3107     h->pred8x8[LEFT_DC_PRED8x8]= pred8x8_left_dc_c;
3108     h->pred8x8[TOP_DC_PRED8x8 ]= pred8x8_top_dc_c;
3109     h->pred8x8[DC_128_PRED8x8 ]= ff_pred8x8_128_dc_c;
3110
3111     h->pred16x16[DC_PRED8x8     ]= ff_pred16x16_dc_c;
3112     h->pred16x16[VERT_PRED8x8   ]= ff_pred16x16_vertical_c;
3113     h->pred16x16[HOR_PRED8x8    ]= ff_pred16x16_horizontal_c;
3114     h->pred16x16[PLANE_PRED8x8  ]= ff_pred16x16_plane_c;
3115     h->pred16x16[LEFT_DC_PRED8x8]= pred16x16_left_dc_c;
3116     h->pred16x16[TOP_DC_PRED8x8 ]= pred16x16_top_dc_c;
3117     h->pred16x16[DC_128_PRED8x8 ]= ff_pred16x16_128_dc_c;
3118 }
3119
3120 static void free_tables(H264Context *h){
3121     av_freep(&h->intra4x4_pred_mode);
3122     av_freep(&h->chroma_pred_mode_table);
3123     av_freep(&h->cbp_table);
3124     av_freep(&h->mvd_table[0]);
3125     av_freep(&h->mvd_table[1]);
3126     av_freep(&h->direct_table);
3127     av_freep(&h->non_zero_count);
3128     av_freep(&h->slice_table_base);
3129     av_freep(&h->top_borders[1]);
3130     av_freep(&h->top_borders[0]);
3131     h->slice_table= NULL;
3132
3133     av_freep(&h->mb2b_xy);
3134     av_freep(&h->mb2b8_xy);
3135
3136     av_freep(&h->s.obmc_scratchpad);
3137 }
3138
3139 static void init_dequant8_coeff_table(H264Context *h){
3140     int i,q,x;
3141     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
3142     h->dequant8_coeff[0] = h->dequant8_buffer[0];
3143     h->dequant8_coeff[1] = h->dequant8_buffer[1];
3144
3145     for(i=0; i<2; i++ ){
3146         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
3147             h->dequant8_coeff[1] = h->dequant8_buffer[0];
3148             break;
3149         }
3150
3151         for(q=0; q<52; q++){
3152             int shift = ff_div6[q];
3153             int idx = ff_rem6[q];
3154             for(x=0; x<64; x++)
3155                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
3156                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
3157                     h->pps.scaling_matrix8[i][x]) << shift;
3158         }
3159     }
3160 }
3161
3162 static void init_dequant4_coeff_table(H264Context *h){
3163     int i,j,q,x;
3164     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
3165     for(i=0; i<6; i++ ){
3166         h->dequant4_coeff[i] = h->dequant4_buffer[i];
3167         for(j=0; j<i; j++){
3168             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
3169                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
3170                 break;
3171             }
3172         }
3173         if(j<i)
3174             continue;
3175
3176         for(q=0; q<52; q++){
3177             int shift = ff_div6[q] + 2;
3178             int idx = ff_rem6[q];
3179             for(x=0; x<16; x++)
3180                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
3181                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
3182                     h->pps.scaling_matrix4[i][x]) << shift;
3183         }
3184     }
3185 }
3186
3187 static void init_dequant_tables(H264Context *h){
3188     int i,x;
3189     init_dequant4_coeff_table(h);
3190     if(h->pps.transform_8x8_mode)
3191         init_dequant8_coeff_table(h);
3192     if(h->sps.transform_bypass){
3193         for(i=0; i<6; i++)
3194             for(x=0; x<16; x++)
3195                 h->dequant4_coeff[i][0][x] = 1<<6;
3196         if(h->pps.transform_8x8_mode)
3197             for(i=0; i<2; i++)
3198                 for(x=0; x<64; x++)
3199                     h->dequant8_coeff[i][0][x] = 1<<6;
3200     }
3201 }
3202
3203
3204 /**
3205  * allocates tables.
3206  * needs width/height
3207  */
3208 static int alloc_tables(H264Context *h){
3209     MpegEncContext * const s = &h->s;
3210     const int big_mb_num= s->mb_stride * (s->mb_height+1);
3211     int x,y;
3212
3213     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
3214
3215     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
3216     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
3217     CHECKED_ALLOCZ(h->top_borders[0]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
3218     CHECKED_ALLOCZ(h->top_borders[1]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
3219     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
3220
3221     if( h->pps.cabac ) {
3222         CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
3223         CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
3224         CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
3225         CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
3226     }
3227
3228     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(uint8_t));
3229     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
3230
3231     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
3232     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
3233     for(y=0; y<s->mb_height; y++){
3234         for(x=0; x<s->mb_width; x++){
3235             const int mb_xy= x + y*s->mb_stride;
3236             const int b_xy = 4*x + 4*y*h->b_stride;
3237             const int b8_xy= 2*x + 2*y*h->b8_stride;
3238
3239             h->mb2b_xy [mb_xy]= b_xy;
3240             h->mb2b8_xy[mb_xy]= b8_xy;
3241         }
3242     }
3243
3244     s->obmc_scratchpad = NULL;
3245
3246     if(!h->dequant4_coeff[0])
3247         init_dequant_tables(h);
3248
3249     return 0;
3250 fail:
3251     free_tables(h);
3252     return -1;
3253 }
3254
3255 static void common_init(H264Context *h){
3256     MpegEncContext * const s = &h->s;
3257
3258     s->width = s->avctx->width;
3259     s->height = s->avctx->height;
3260     s->codec_id= s->avctx->codec->id;
3261
3262     init_pred_ptrs(h);
3263
3264     h->dequant_coeff_pps= -1;
3265     s->unrestricted_mv=1;
3266     s->decode=1; //FIXME
3267
3268     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
3269     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
3270 }
3271
3272 static int decode_init(AVCodecContext *avctx){
3273     H264Context *h= avctx->priv_data;
3274     MpegEncContext * const s = &h->s;
3275
3276     MPV_decode_defaults(s);
3277
3278     s->avctx = avctx;
3279     common_init(h);
3280
3281     s->out_format = FMT_H264;
3282     s->workaround_bugs= avctx->workaround_bugs;
3283
3284     // set defaults
3285 //    s->decode_mb= ff_h263_decode_mb;
3286     s->low_delay= 1;
3287     avctx->pix_fmt= PIX_FMT_YUV420P;
3288
3289     decode_init_vlc();
3290
3291     if(avctx->extradata_size > 0 && avctx->extradata &&
3292        *(char *)avctx->extradata == 1){
3293         h->is_avc = 1;
3294         h->got_avcC = 0;
3295     } else {
3296         h->is_avc = 0;
3297     }
3298
3299     return 0;
3300 }
3301
3302 static int frame_start(H264Context *h){
3303     MpegEncContext * const s = &h->s;
3304     int i;
3305
3306     if(MPV_frame_start(s, s->avctx) < 0)
3307         return -1;
3308     ff_er_frame_start(s);
3309
3310     assert(s->linesize && s->uvlinesize);
3311
3312     for(i=0; i<16; i++){
3313         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
3314         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
3315     }
3316     for(i=0; i<4; i++){
3317         h->block_offset[16+i]=
3318         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
3319         h->block_offset[24+16+i]=
3320         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
3321     }
3322
3323     /* can't be in alloc_tables because linesize isn't known there.
3324      * FIXME: redo bipred weight to not require extra buffer? */
3325     if(!s->obmc_scratchpad)
3326         s->obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
3327
3328     /* some macroblocks will be accessed before they're available */
3329     if(FRAME_MBAFF)
3330         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
3331
3332 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
3333     return 0;
3334 }
3335
3336 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
3337     MpegEncContext * const s = &h->s;
3338     int i;
3339
3340     src_y  -=   linesize;
3341     src_cb -= uvlinesize;
3342     src_cr -= uvlinesize;
3343
3344     // There are two lines saved, the line above the the top macroblock of a pair,
3345     // and the line above the bottom macroblock
3346     h->left_border[0]= h->top_borders[0][s->mb_x][15];
3347     for(i=1; i<17; i++){
3348         h->left_border[i]= src_y[15+i*  linesize];
3349     }
3350
3351     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
3352     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
3353
3354     if(!(s->flags&CODEC_FLAG_GRAY)){
3355         h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
3356         h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
3357         for(i=1; i<9; i++){
3358             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
3359             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
3360         }
3361         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
3362         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
3363     }
3364 }
3365
3366 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
3367     MpegEncContext * const s = &h->s;
3368     int temp8, i;
3369     uint64_t temp64;
3370     int deblock_left = (s->mb_x > 0);
3371     int deblock_top  = (s->mb_y > 0);
3372
3373     src_y  -=   linesize + 1;
3374     src_cb -= uvlinesize + 1;
3375     src_cr -= uvlinesize + 1;
3376
3377 #define XCHG(a,b,t,xchg)\
3378 t= a;\
3379 if(xchg)\
3380     a= b;\
3381 b= t;
3382
3383     if(deblock_left){
3384         for(i = !deblock_top; i<17; i++){
3385             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
3386         }
3387     }
3388
3389     if(deblock_top){
3390         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
3391         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
3392         if(s->mb_x+1 < s->mb_width){
3393             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
3394         }
3395     }
3396
3397     if(!(s->flags&CODEC_FLAG_GRAY)){
3398         if(deblock_left){
3399             for(i = !deblock_top; i<9; i++){
3400                 XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
3401                 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
3402             }
3403         }
3404         if(deblock_top){
3405             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
3406             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
3407         }
3408     }
3409 }
3410
3411 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
3412     MpegEncContext * const s = &h->s;
3413     int i;
3414
3415     src_y  -= 2 *   linesize;
3416     src_cb -= 2 * uvlinesize;
3417     src_cr -= 2 * uvlinesize;
3418
3419     // There are two lines saved, the line above the the top macroblock of a pair,
3420     // and the line above the bottom macroblock
3421     h->left_border[0]= h->top_borders[0][s->mb_x][15];
3422     h->left_border[1]= h->top_borders[1][s->mb_x][15];
3423     for(i=2; i<34; i++){
3424         h->left_border[i]= src_y[15+i*  linesize];
3425     }
3426
3427     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
3428     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
3429     *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
3430     *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
3431
3432     if(!(s->flags&CODEC_FLAG_GRAY)){
3433         h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
3434         h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
3435         h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
3436         h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
3437         for(i=2; i<18; i++){
3438             h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
3439             h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
3440         }
3441         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
3442         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
3443         *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
3444         *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
3445     }
3446 }
3447
3448 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
3449     MpegEncContext * const s = &h->s;
3450     int temp8, i;
3451     uint64_t temp64;
3452     int deblock_left = (s->mb_x > 0);
3453     int deblock_top  = (s->mb_y > 1);
3454
3455     tprintf(s->avctx, "xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
3456
3457     src_y  -= 2 *   linesize + 1;
3458     src_cb -= 2 * uvlinesize + 1;
3459     src_cr -= 2 * uvlinesize + 1;
3460
3461 #define XCHG(a,b,t,xchg)\
3462 t= a;\
3463 if(xchg)\
3464     a= b;\
3465 b= t;
3466
3467     if(deblock_left){
3468         for(i = (!deblock_top)<<1; i<34; i++){
3469             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
3470         }
3471     }
3472
3473     if(deblock_top){
3474         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
3475         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
3476         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
3477         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
3478         if(s->mb_x+1 < s->mb_width){
3479             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
3480             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
3481         }
3482     }
3483
3484     if(!(s->flags&CODEC_FLAG_GRAY)){
3485         if(deblock_left){
3486             for(i = (!deblock_top) << 1; i<18; i++){
3487                 XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
3488                 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
3489             }
3490         }
3491         if(deblock_top){
3492             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
3493             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
3494             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
3495             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
3496         }
3497     }
3498 }
3499
3500 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
3501     MpegEncContext * const s = &h->s;
3502     const int mb_x= s->mb_x;
3503     const int mb_y= s->mb_y;
3504     const int mb_xy= mb_x + mb_y*s->mb_stride;
3505     const int mb_type= s->current_picture.mb_type[mb_xy];
3506     uint8_t  *dest_y, *dest_cb, *dest_cr;
3507     int linesize, uvlinesize /*dct_offset*/;
3508     int i;
3509     int *block_offset = &h->block_offset[0];
3510     const unsigned int bottom = mb_y & 1;
3511     const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
3512     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
3513     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
3514
3515     dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
3516     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3517     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3518
3519     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
3520     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
3521
3522     if (!simple && MB_FIELD) {
3523         linesize   = h->mb_linesize   = s->linesize * 2;
3524         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
3525         block_offset = &h->block_offset[24];
3526         if(mb_y&1){ //FIXME move out of this func?
3527             dest_y -= s->linesize*15;
3528             dest_cb-= s->uvlinesize*7;
3529             dest_cr-= s->uvlinesize*7;
3530         }
3531         if(FRAME_MBAFF) {
3532             int list;
3533             for(list=0; list<h->list_count; list++){
3534                 if(!USES_LIST(mb_type, list))
3535                     continue;
3536                 if(IS_16X16(mb_type)){
3537                     int8_t *ref = &h->ref_cache[list][scan8[0]];
3538                     fill_rectangle(ref, 4, 4, 8, 16+*ref^(s->mb_y&1), 1);
3539                 }else{
3540                     for(i=0; i<16; i+=4){
3541                         //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
3542                         int ref = h->ref_cache[list][scan8[i]];
3543                         if(ref >= 0)
3544                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, 16+ref^(s->mb_y&1), 1);
3545                     }
3546                 }
3547             }
3548         }
3549     } else {
3550         linesize   = h->mb_linesize   = s->linesize;
3551         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
3552 //        dct_offset = s->linesize * 16;
3553     }
3554
3555     if(transform_bypass){
3556         idct_dc_add =
3557         idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
3558     }else if(IS_8x8DCT(mb_type)){
3559         idct_dc_add = s->dsp.h264_idct8_dc_add;
3560         idct_add = s->dsp.h264_idct8_add;
3561     }else{
3562         idct_dc_add = s->dsp.h264_idct_dc_add;
3563         idct_add = s->dsp.h264_idct_add;
3564     }
3565
3566     if(!simple && FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
3567        && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
3568         int mbt_y = mb_y&~1;
3569         uint8_t *top_y  = s->current_picture.data[0] + (mbt_y * 16* s->linesize  ) + mb_x * 16;
3570         uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
3571         uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
3572         xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
3573     }
3574
3575     if (!simple && IS_INTRA_PCM(mb_type)) {
3576         unsigned int x, y;
3577
3578         // The pixels are stored in h->mb array in the same order as levels,
3579         // copy them in output in the correct order.
3580         for(i=0; i<16; i++) {
3581             for (y=0; y<4; y++) {
3582                 for (x=0; x<4; x++) {
3583                     *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
3584                 }
3585             }
3586         }
3587         for(i=16; i<16+4; i++) {
3588             for (y=0; y<4; y++) {
3589                 for (x=0; x<4; x++) {
3590                     *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
3591                 }
3592             }
3593         }
3594         for(i=20; i<20+4; i++) {
3595             for (y=0; y<4; y++) {
3596                 for (x=0; x<4; x++) {
3597                     *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
3598                 }
3599             }
3600         }
3601     } else {
3602         if(IS_INTRA(mb_type)){
3603             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
3604                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1);
3605
3606             if(simple || !(s->flags&CODEC_FLAG_GRAY)){
3607                 h->pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
3608                 h->pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
3609             }
3610
3611             if(IS_INTRA4x4(mb_type)){
3612                 if(simple || !s->encoding){
3613                     if(IS_8x8DCT(mb_type)){
3614                         for(i=0; i<16; i+=4){
3615                             uint8_t * const ptr= dest_y + block_offset[i];
3616                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
3617                             const int nnz = h->non_zero_count_cache[ scan8[i] ];
3618                             h->pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
3619                                                    (h->topright_samples_available<<i)&0x4000, linesize);
3620                             if(nnz){
3621                                 if(nnz == 1 && h->mb[i*16])
3622                                     idct_dc_add(ptr, h->mb + i*16, linesize);
3623                                 else
3624                                     idct_add(ptr, h->mb + i*16, linesize);
3625                             }
3626                         }
3627                     }else
3628                     for(i=0; i<16; i++){
3629                         uint8_t * const ptr= dest_y + block_offset[i];
3630                         uint8_t *topright;
3631                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
3632                         int nnz, tr;
3633
3634                         if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
3635                             const int topright_avail= (h->topright_samples_available<<i)&0x8000;
3636                             assert(mb_y || linesize <= block_offset[i]);
3637                             if(!topright_avail){
3638                                 tr= ptr[3 - linesize]*0x01010101;
3639                                 topright= (uint8_t*) &tr;
3640                             }else
3641                                 topright= ptr + 4 - linesize;
3642                         }else
3643                             topright= NULL;
3644
3645                         h->pred4x4[ dir ](ptr, topright, linesize);
3646                         nnz = h->non_zero_count_cache[ scan8[i] ];
3647                         if(nnz){
3648                             if(is_h264){
3649                                 if(nnz == 1 && h->mb[i*16])
3650                                     idct_dc_add(ptr, h->mb + i*16, linesize);
3651                                 else
3652                                     idct_add(ptr, h->mb + i*16, linesize);
3653                             }else
3654                                 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
3655                         }
3656                     }
3657                 }
3658             }else{
3659                 h->pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
3660                 if(is_h264){
3661                     if(!transform_bypass)
3662                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[IS_INTRA(mb_type) ? 0:3][s->qscale][0]);
3663                 }else
3664                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
3665             }
3666             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
3667                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0);
3668         }else if(is_h264){
3669             hl_motion(h, dest_y, dest_cb, dest_cr,
3670                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
3671                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
3672                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
3673         }
3674
3675
3676         if(!IS_INTRA4x4(mb_type)){
3677             if(is_h264){
3678                 if(IS_INTRA16x16(mb_type)){
3679                     for(i=0; i<16; i++){
3680                         if(h->non_zero_count_cache[ scan8[i] ])
3681                             idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3682                         else if(h->mb[i*16])
3683                             idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3684                     }
3685                 }else{
3686                     const int di = IS_8x8DCT(mb_type) ? 4 : 1;
3687                     for(i=0; i<16; i+=di){
3688                         int nnz = h->non_zero_count_cache[ scan8[i] ];
3689                         if(nnz){
3690                             if(nnz==1 && h->mb[i*16])
3691                                 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3692                             else
3693                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3694                         }
3695                     }
3696                 }
3697             }else{
3698                 for(i=0; i<16; i++){
3699                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
3700                         uint8_t * const ptr= dest_y + block_offset[i];
3701                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
3702                     }
3703                 }
3704             }
3705         }
3706
3707         if(simple || !(s->flags&CODEC_FLAG_GRAY)){
3708             uint8_t *dest[2] = {dest_cb, dest_cr};
3709             if(transform_bypass){
3710                 idct_add = idct_dc_add = s->dsp.add_pixels4;
3711             }else{
3712                 idct_add = s->dsp.h264_idct_add;
3713                 idct_dc_add = s->dsp.h264_idct_dc_add;
3714                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp][0]);
3715                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp][0]);
3716             }
3717             if(is_h264){
3718                 for(i=16; i<16+8; i++){
3719                     if(h->non_zero_count_cache[ scan8[i] ])
3720                         idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
3721                     else if(h->mb[i*16])
3722                         idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
3723                 }
3724             }else{
3725                 for(i=16; i<16+8; i++){
3726                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
3727                         uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
3728                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
3729                     }
3730                 }
3731             }
3732         }
3733     }
3734     if(h->deblocking_filter) {
3735         if (!simple && FRAME_MBAFF) {
3736             //FIXME try deblocking one mb at a time?
3737             // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
3738             const int mb_y = s->mb_y - 1;
3739             uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
3740             const int mb_xy= mb_x + mb_y*s->mb_stride;
3741             const int mb_type_top   = s->current_picture.mb_type[mb_xy];
3742             const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
3743             if (!bottom) return;
3744             pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
3745             pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3746             pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3747
3748             if(IS_INTRA(mb_type_top | mb_type_bottom))
3749                 xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
3750
3751             backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
3752             // deblock a pair
3753             // top
3754             s->mb_y--;
3755             tprintf(h->s.avctx, "call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
3756             fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
3757             h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy]);
3758             filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
3759             // bottom
3760             s->mb_y++;
3761             tprintf(h->s.avctx, "call mbaff filter_mb\n");
3762             fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
3763             h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
3764             filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3765         } else {
3766             tprintf(h->s.avctx, "call filter_mb\n");
3767             backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3768             fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
3769             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3770         }
3771     }
3772 }
3773
3774 /**
3775  * Process a macroblock; this case avoids checks for expensive uncommon cases.
3776  */
3777 static void hl_decode_mb_simple(H264Context *h){
3778     hl_decode_mb_internal(h, 1);
3779 }
3780
3781 /**
3782  * Process a macroblock; this handles edge cases, such as interlacing.
3783  */
3784 static void av_noinline hl_decode_mb_complex(H264Context *h){
3785     hl_decode_mb_internal(h, 0);
3786 }
3787
3788 static void hl_decode_mb(H264Context *h){
3789     MpegEncContext * const s = &h->s;
3790     const int mb_x= s->mb_x;
3791     const int mb_y= s->mb_y;
3792     const int mb_xy= mb_x + mb_y*s->mb_stride;
3793     const int mb_type= s->current_picture.mb_type[mb_xy];
3794     int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 || (s->flags&CODEC_FLAG_GRAY) || s->encoding;
3795
3796     if(!s->decode)
3797         return;
3798
3799     if (is_complex)
3800         hl_decode_mb_complex(h);
3801     else hl_decode_mb_simple(h);
3802 }
3803
3804 /**
3805  * fills the default_ref_list.
3806  */
3807 static int fill_default_ref_list(H264Context *h){
3808     MpegEncContext * const s = &h->s;
3809     int i;
3810     int smallest_poc_greater_than_current = -1;
3811     Picture sorted_short_ref[32];
3812
3813     if(h->slice_type==B_TYPE){
3814         int out_i;
3815         int limit= INT_MIN;
3816
3817         /* sort frame according to poc in B slice */
3818         for(out_i=0; out_i<h->short_ref_count; out_i++){
3819             int best_i=INT_MIN;
3820             int best_poc=INT_MAX;
3821
3822             for(i=0; i<h->short_ref_count; i++){
3823                 const int poc= h->short_ref[i]->poc;
3824                 if(poc > limit && poc < best_poc){
3825                     best_poc= poc;
3826                     best_i= i;
3827                 }
3828             }
3829
3830             assert(best_i != INT_MIN);
3831
3832             limit= best_poc;
3833             sorted_short_ref[out_i]= *h->short_ref[best_i];
3834             tprintf(h->s.avctx, "sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
3835             if (-1 == smallest_poc_greater_than_current) {
3836                 if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
3837                     smallest_poc_greater_than_current = out_i;
3838                 }
3839             }
3840         }
3841     }
3842
3843     if(s->picture_structure == PICT_FRAME){
3844         if(h->slice_type==B_TYPE){
3845             int list;
3846             tprintf(h->s.avctx, "current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
3847
3848             // find the largest poc
3849             for(list=0; list<2; list++){
3850                 int index = 0;
3851                 int j= -99;
3852                 int step= list ? -1 : 1;
3853
3854                 for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
3855                     while(j<0 || j>= h->short_ref_count){
3856                         if(j != -99 && step == (list ? -1 : 1))
3857                             return -1;
3858                         step = -step;
3859                         j= smallest_poc_greater_than_current + (step>>1);
3860                     }
3861                     if(sorted_short_ref[j].reference != 3) continue;
3862                     h->default_ref_list[list][index  ]= sorted_short_ref[j];
3863                     h->default_ref_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
3864                 }
3865
3866                 for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
3867                     if(h->long_ref[i] == NULL) continue;
3868                     if(h->long_ref[i]->reference != 3) continue;
3869
3870                     h->default_ref_list[ list ][index  ]= *h->long_ref[i];
3871                     h->default_ref_list[ list ][index++].pic_id= i;;
3872                 }
3873
3874                 if(list && (smallest_poc_greater_than_current<=0 || smallest_poc_greater_than_current>=h->short_ref_count) && (1 < index)){
3875                     // swap the two first elements of L1 when
3876                     // L0 and L1 are identical
3877                     Picture temp= h->default_ref_list[1][0];
3878                     h->default_ref_list[1][0] = h->default_ref_list[1][1];
3879                     h->default_ref_list[1][1] = temp;
3880                 }
3881
3882                 if(index < h->ref_count[ list ])
3883                     memset(&h->default_ref_list[list][index], 0, sizeof(Picture)*(h->ref_count[ list ] - index));
3884             }
3885         }else{
3886             int index=0;
3887             for(i=0; i<h->short_ref_count; i++){
3888                 if(h->short_ref[i]->reference != 3) continue; //FIXME refernce field shit
3889                 h->default_ref_list[0][index  ]= *h->short_ref[i];
3890                 h->default_ref_list[0][index++].pic_id= h->short_ref[i]->frame_num;
3891             }
3892             for(i = 0; i < 16; i++){
3893                 if(h->long_ref[i] == NULL) continue;
3894                 if(h->long_ref[i]->reference != 3) continue;
3895                 h->default_ref_list[0][index  ]= *h->long_ref[i];
3896                 h->default_ref_list[0][index++].pic_id= i;;
3897             }
3898             if(index < h->ref_count[0])
3899                 memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
3900         }
3901     }else{ //FIELD
3902         if(h->slice_type==B_TYPE){
3903         }else{
3904             //FIXME second field balh
3905         }
3906     }
3907 #ifdef TRACE
3908     for (i=0; i<h->ref_count[0]; i++) {
3909         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
3910     }
3911     if(h->slice_type==B_TYPE){
3912         for (i=0; i<h->ref_count[1]; i++) {
3913             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[0][i].data[0]);
3914         }
3915     }
3916 #endif
3917     return 0;
3918 }
3919
3920 static void print_short_term(H264Context *h);
3921 static void print_long_term(H264Context *h);
3922
3923 static int decode_ref_pic_list_reordering(H264Context *h){
3924     MpegEncContext * const s = &h->s;
3925     int list, index;
3926
3927     print_short_term(h);
3928     print_long_term(h);
3929     if(h->slice_type==I_TYPE || h->slice_type==SI_TYPE) return 0; //FIXME move before func
3930
3931     for(list=0; list<h->list_count; list++){
3932         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
3933
3934         if(get_bits1(&s->gb)){
3935             int pred= h->curr_pic_num;
3936
3937             for(index=0; ; index++){
3938                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
3939                 unsigned int pic_id;
3940                 int i;
3941                 Picture *ref = NULL;
3942
3943                 if(reordering_of_pic_nums_idc==3)
3944                     break;
3945
3946                 if(index >= h->ref_count[list]){
3947                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
3948                     return -1;
3949                 }
3950
3951                 if(reordering_of_pic_nums_idc<3){
3952                     if(reordering_of_pic_nums_idc<2){
3953                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
3954
3955                         if(abs_diff_pic_num >= h->max_pic_num){
3956                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
3957                             return -1;
3958                         }
3959
3960                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
3961                         else                                pred+= abs_diff_pic_num;
3962                         pred &= h->max_pic_num - 1;
3963
3964                         for(i= h->short_ref_count-1; i>=0; i--){
3965                             ref = h->short_ref[i];
3966                             assert(ref->reference == 3);
3967                             assert(!ref->long_ref);
3968                             if(ref->data[0] != NULL && ref->frame_num == pred && ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer
3969                                 break;
3970                         }
3971                         if(i>=0)
3972                             ref->pic_id= ref->frame_num;
3973                     }else{
3974                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
3975                         if(pic_id>31){
3976                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
3977                             return -1;
3978                         }
3979                         ref = h->long_ref[pic_id];
3980                         if(ref){
3981                             ref->pic_id= pic_id;
3982                             assert(ref->reference == 3);
3983                             assert(ref->long_ref);
3984                             i=0;
3985                         }else{
3986                             i=-1;
3987                         }
3988                     }
3989
3990                     if (i < 0) {
3991                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
3992                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
3993                     } else {
3994                         for(i=index; i+1<h->ref_count[list]; i++){
3995                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
3996                                 break;
3997                         }
3998                         for(; i > index; i--){
3999                             h->ref_list[list][i]= h->ref_list[list][i-1];
4000                         }
4001                         h->ref_list[list][index]= *ref;
4002                     }
4003                 }else{
4004                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
4005                     return -1;
4006                 }
4007             }
4008         }
4009     }
4010     for(list=0; list<h->list_count; list++){
4011         for(index= 0; index < h->ref_count[list]; index++){
4012             if(!h->ref_list[list][index].data[0])
4013                 h->ref_list[list][index]= s->current_picture;
4014         }
4015     }
4016
4017     if(h->slice_type==B_TYPE && !h->direct_spatial_mv_pred)
4018         direct_dist_scale_factor(h);
4019     direct_ref_list_init(h);
4020     return 0;
4021 }
4022
4023 static void fill_mbaff_ref_list(H264Context *h){
4024     int list, i, j;
4025     for(list=0; list<2; list++){ //FIXME try list_count
4026         for(i=0; i<h->ref_count[list]; i++){
4027             Picture *frame = &h->ref_list[list][i];
4028             Picture *field = &h->ref_list[list][16+2*i];
4029             field[0] = *frame;
4030             for(j=0; j<3; j++)
4031                 field[0].linesize[j] <<= 1;
4032             field[1] = field[0];
4033             for(j=0; j<3; j++)
4034                 field[1].data[j] += frame->linesize[j];
4035
4036             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
4037             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
4038             for(j=0; j<2; j++){
4039                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
4040                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
4041             }
4042         }
4043     }
4044     for(j=0; j<h->ref_count[1]; j++){
4045         for(i=0; i<h->ref_count[0]; i++)
4046             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
4047         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
4048         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
4049     }
4050 }
4051
4052 static int pred_weight_table(H264Context *h){
4053     MpegEncContext * const s = &h->s;
4054     int list, i;
4055     int luma_def, chroma_def;
4056
4057     h->use_weight= 0;
4058     h->use_weight_chroma= 0;
4059     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
4060     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
4061     luma_def = 1<<h->luma_log2_weight_denom;
4062     chroma_def = 1<<h->chroma_log2_weight_denom;
4063
4064     for(list=0; list<2; list++){
4065         for(i=0; i<h->ref_count[list]; i++){
4066             int luma_weight_flag, chroma_weight_flag;
4067
4068             luma_weight_flag= get_bits1(&s->gb);
4069             if(luma_weight_flag){
4070                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
4071                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
4072                 if(   h->luma_weight[list][i] != luma_def
4073                    || h->luma_offset[list][i] != 0)
4074                     h->use_weight= 1;
4075             }else{
4076                 h->luma_weight[list][i]= luma_def;
4077                 h->luma_offset[list][i]= 0;
4078             }
4079
4080             chroma_weight_flag= get_bits1(&s->gb);
4081             if(chroma_weight_flag){
4082                 int j;
4083                 for(j=0; j<2; j++){
4084                     h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
4085                     h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
4086                     if(   h->chroma_weight[list][i][j] != chroma_def
4087                        || h->chroma_offset[list][i][j] != 0)
4088                         h->use_weight_chroma= 1;
4089                 }
4090             }else{
4091                 int j;
4092                 for(j=0; j<2; j++){
4093                     h->chroma_weight[list][i][j]= chroma_def;
4094                     h->chroma_offset[list][i][j]= 0;
4095                 }
4096             }
4097         }
4098         if(h->slice_type != B_TYPE) break;
4099     }
4100     h->use_weight= h->use_weight || h->use_weight_chroma;
4101     return 0;
4102 }
4103
4104 static void implicit_weight_table(H264Context *h){
4105     MpegEncContext * const s = &h->s;
4106     int ref0, ref1;
4107     int cur_poc = s->current_picture_ptr->poc;
4108
4109     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
4110        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
4111         h->use_weight= 0;
4112         h->use_weight_chroma= 0;
4113         return;
4114     }
4115
4116     h->use_weight= 2;
4117     h->use_weight_chroma= 2;
4118     h->luma_log2_weight_denom= 5;
4119     h->chroma_log2_weight_denom= 5;
4120
4121     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
4122         int poc0 = h->ref_list[0][ref0].poc;
4123         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
4124             int poc1 = h->ref_list[1][ref1].poc;
4125             int td = av_clip(poc1 - poc0, -128, 127);
4126             if(td){
4127                 int tb = av_clip(cur_poc - poc0, -128, 127);
4128                 int tx = (16384 + (FFABS(td) >> 1)) / td;
4129                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
4130                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
4131                     h->implicit_weight[ref0][ref1] = 32;
4132                 else
4133                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
4134             }else
4135                 h->implicit_weight[ref0][ref1] = 32;
4136         }
4137     }
4138 }
4139
4140 static inline void unreference_pic(H264Context *h, Picture *pic){
4141     int i;
4142     pic->reference=0;
4143     if(pic == h->delayed_output_pic)
4144         pic->reference=1;
4145     else{
4146         for(i = 0; h->delayed_pic[i]; i++)
4147             if(pic == h->delayed_pic[i]){
4148                 pic->reference=1;
4149                 break;
4150             }
4151     }
4152 }
4153
4154 /**
4155  * instantaneous decoder refresh.
4156  */
4157 static void idr(H264Context *h){
4158     int i;
4159
4160     for(i=0; i<16; i++){
4161         if (h->long_ref[i] != NULL) {
4162             unreference_pic(h, h->long_ref[i]);
4163             h->long_ref[i]= NULL;
4164         }
4165     }
4166     h->long_ref_count=0;
4167
4168     for(i=0; i<h->short_ref_count; i++){
4169         unreference_pic(h, h->short_ref[i]);
4170         h->short_ref[i]= NULL;
4171     }
4172     h->short_ref_count=0;
4173 }
4174
4175 /* forget old pics after a seek */
4176 static void flush_dpb(AVCodecContext *avctx){
4177     H264Context *h= avctx->priv_data;
4178     int i;
4179     for(i=0; i<16; i++) {
4180         if(h->delayed_pic[i])
4181             h->delayed_pic[i]->reference= 0;
4182         h->delayed_pic[i]= NULL;
4183     }
4184     if(h->delayed_output_pic)
4185         h->delayed_output_pic->reference= 0;
4186     h->delayed_output_pic= NULL;
4187     idr(h);
4188     if(h->s.current_picture_ptr)
4189         h->s.current_picture_ptr->reference= 0;
4190 }
4191
4192 /**
4193  *
4194  * @return the removed picture or NULL if an error occurs
4195  */
4196 static Picture * remove_short(H264Context *h, int frame_num){
4197     MpegEncContext * const s = &h->s;
4198     int i;
4199
4200     if(s->avctx->debug&FF_DEBUG_MMCO)
4201         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
4202
4203     for(i=0; i<h->short_ref_count; i++){
4204         Picture *pic= h->short_ref[i];
4205         if(s->avctx->debug&FF_DEBUG_MMCO)
4206             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
4207         if(pic->frame_num == frame_num){
4208             h->short_ref[i]= NULL;
4209             memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i - 1)*sizeof(Picture*));
4210             h->short_ref_count--;
4211             return pic;
4212         }
4213     }
4214     return NULL;
4215 }
4216
4217 /**
4218  *
4219  * @return the removed picture or NULL if an error occurs
4220  */
4221 static Picture * remove_long(H264Context *h, int i){
4222     Picture *pic;
4223
4224     pic= h->long_ref[i];
4225     h->long_ref[i]= NULL;
4226     if(pic) h->long_ref_count--;
4227
4228     return pic;
4229 }
4230
4231 /**
4232  * print short term list
4233  */
4234 static void print_short_term(H264Context *h) {
4235     uint32_t i;
4236     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
4237         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
4238         for(i=0; i<h->short_ref_count; i++){
4239             Picture *pic= h->short_ref[i];
4240             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
4241         }
4242     }
4243 }
4244
4245 /**
4246  * print long term list
4247  */
4248 static void print_long_term(H264Context *h) {
4249     uint32_t i;
4250     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
4251         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
4252         for(i = 0; i < 16; i++){
4253             Picture *pic= h->long_ref[i];
4254             if (pic) {
4255                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
4256             }
4257         }
4258     }
4259 }
4260
4261 /**
4262  * Executes the reference picture marking (memory management control operations).
4263  */
4264 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
4265     MpegEncContext * const s = &h->s;
4266     int i, j;
4267     int current_is_long=0;
4268     Picture *pic;
4269
4270     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
4271         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
4272
4273     for(i=0; i<mmco_count; i++){
4274         if(s->avctx->debug&FF_DEBUG_MMCO)
4275             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_frame_num, h->mmco[i].long_index);
4276
4277         switch(mmco[i].opcode){
4278         case MMCO_SHORT2UNUSED:
4279             pic= remove_short(h, mmco[i].short_frame_num);
4280             if(pic)
4281                 unreference_pic(h, pic);
4282             else if(s->avctx->debug&FF_DEBUG_MMCO)
4283                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_short() failure\n");
4284             break;
4285         case MMCO_SHORT2LONG:
4286             pic= remove_long(h, mmco[i].long_index);
4287             if(pic) unreference_pic(h, pic);
4288
4289             h->long_ref[ mmco[i].long_index ]= remove_short(h, mmco[i].short_frame_num);
4290             if (h->long_ref[ mmco[i].long_index ]){
4291                 h->long_ref[ mmco[i].long_index ]->long_ref=1;
4292                 h->long_ref_count++;
4293             }
4294             break;
4295         case MMCO_LONG2UNUSED:
4296             pic= remove_long(h, mmco[i].long_index);
4297             if(pic)
4298                 unreference_pic(h, pic);
4299             else if(s->avctx->debug&FF_DEBUG_MMCO)
4300                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_long() failure\n");
4301             break;
4302         case MMCO_LONG:
4303             pic= remove_long(h, mmco[i].long_index);
4304             if(pic) unreference_pic(h, pic);
4305
4306             h->long_ref[ mmco[i].long_index ]= s->current_picture_ptr;
4307             h->long_ref[ mmco[i].long_index ]->long_ref=1;
4308             h->long_ref_count++;
4309
4310             current_is_long=1;
4311             break;
4312         case MMCO_SET_MAX_LONG:
4313             assert(mmco[i].long_index <= 16);
4314             // just remove the long term which index is greater than new max
4315             for(j = mmco[i].long_index; j<16; j++){
4316                 pic = remove_long(h, j);
4317                 if (pic) unreference_pic(h, pic);
4318             }
4319             break;
4320         case MMCO_RESET:
4321             while(h->short_ref_count){
4322                 pic= remove_short(h, h->short_ref[0]->frame_num);
4323                 if(pic) unreference_pic(h, pic);
4324             }
4325             for(j = 0; j < 16; j++) {
4326                 pic= remove_long(h, j);
4327                 if(pic) unreference_pic(h, pic);
4328             }
4329             break;
4330         default: assert(0);
4331         }
4332     }
4333
4334     if(!current_is_long){
4335         pic= remove_short(h, s->current_picture_ptr->frame_num);
4336         if(pic){
4337             unreference_pic(h, pic);
4338             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
4339         }
4340
4341         if(h->short_ref_count)
4342             memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
4343
4344         h->short_ref[0]= s->current_picture_ptr;
4345         h->short_ref[0]->long_ref=0;
4346         h->short_ref_count++;
4347     }
4348
4349     print_short_term(h);
4350     print_long_term(h);
4351     return 0;
4352 }
4353
4354 static int decode_ref_pic_marking(H264Context *h){
4355     MpegEncContext * const s = &h->s;
4356     int i;
4357
4358     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
4359         s->broken_link= get_bits1(&s->gb) -1;
4360         h->mmco[0].long_index= get_bits1(&s->gb) - 1; // current_long_term_idx
4361         if(h->mmco[0].long_index == -1)
4362             h->mmco_index= 0;
4363         else{
4364             h->mmco[0].opcode= MMCO_LONG;
4365             h->mmco_index= 1;
4366         }
4367     }else{
4368         if(get_bits1(&s->gb)){ // adaptive_ref_pic_marking_mode_flag
4369             for(i= 0; i<MAX_MMCO_COUNT; i++) {
4370                 MMCOOpcode opcode= get_ue_golomb(&s->gb);;
4371
4372                 h->mmco[i].opcode= opcode;
4373                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
4374                     h->mmco[i].short_frame_num= (h->frame_num - get_ue_golomb(&s->gb) - 1) & ((1<<h->sps.log2_max_frame_num)-1); //FIXME fields
4375 /*                    if(h->mmco[i].short_frame_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_frame_num ] == NULL){
4376                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
4377                         return -1;
4378                     }*/
4379                 }
4380                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
4381                     unsigned int long_index= get_ue_golomb(&s->gb);
4382                     if(/*h->mmco[i].long_index >= h->long_ref_count || h->long_ref[ h->mmco[i].long_index ] == NULL*/ long_index >= 16){
4383                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
4384                         return -1;
4385                     }
4386                     h->mmco[i].long_index= long_index;
4387                 }
4388
4389                 if(opcode > (unsigned)MMCO_LONG){
4390                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
4391                     return -1;
4392                 }
4393                 if(opcode == MMCO_END)
4394                     break;
4395             }
4396             h->mmco_index= i;
4397         }else{
4398             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
4399
4400             if(h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count){ //FIXME fields
4401                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
4402                 h->mmco[0].short_frame_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
4403                 h->mmco_index= 1;
4404             }else
4405                 h->mmco_index= 0;
4406         }
4407     }
4408
4409     return 0;
4410 }
4411
4412 static int init_poc(H264Context *h){
4413     MpegEncContext * const s = &h->s;
4414     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
4415     int field_poc[2];
4416
4417     if(h->nal_unit_type == NAL_IDR_SLICE){
4418         h->frame_num_offset= 0;
4419     }else{
4420         if(h->frame_num < h->prev_frame_num)
4421             h->frame_num_offset= h->prev_frame_num_offset + max_frame_num;
4422         else
4423             h->frame_num_offset= h->prev_frame_num_offset;
4424     }
4425
4426     if(h->sps.poc_type==0){
4427         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
4428
4429         if(h->nal_unit_type == NAL_IDR_SLICE){
4430              h->prev_poc_msb=
4431              h->prev_poc_lsb= 0;
4432         }
4433
4434         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
4435             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
4436         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
4437             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
4438         else
4439             h->poc_msb = h->prev_poc_msb;
4440 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
4441         field_poc[0] =
4442         field_poc[1] = h->poc_msb + h->poc_lsb;
4443         if(s->picture_structure == PICT_FRAME)
4444             field_poc[1] += h->delta_poc_bottom;
4445     }else if(h->sps.poc_type==1){
4446         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
4447         int i;
4448
4449         if(h->sps.poc_cycle_length != 0)
4450             abs_frame_num = h->frame_num_offset + h->frame_num;
4451         else
4452             abs_frame_num = 0;
4453
4454         if(h->nal_ref_idc==0 && abs_frame_num > 0)
4455             abs_frame_num--;
4456
4457         expected_delta_per_poc_cycle = 0;
4458         for(i=0; i < h->sps.poc_cycle_length; i++)
4459             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
4460
4461         if(abs_frame_num > 0){
4462             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
4463             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
4464
4465             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
4466             for(i = 0; i <= frame_num_in_poc_cycle; i++)
4467                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
4468         } else
4469             expectedpoc = 0;
4470
4471         if(h->nal_ref_idc == 0)
4472             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
4473
4474         field_poc[0] = expectedpoc + h->delta_poc[0];
4475         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
4476
4477         if(s->picture_structure == PICT_FRAME)
4478             field_poc[1] += h->delta_poc[1];
4479     }else{
4480         int poc;
4481         if(h->nal_unit_type == NAL_IDR_SLICE){
4482             poc= 0;
4483         }else{
4484             if(h->nal_ref_idc) poc= 2*(h->frame_num_offset + h->frame_num);
4485             else               poc= 2*(h->frame_num_offset + h->frame_num) - 1;
4486         }
4487         field_poc[0]= poc;
4488         field_poc[1]= poc;
4489     }
4490
4491     if(s->picture_structure != PICT_BOTTOM_FIELD)
4492         s->current_picture_ptr->field_poc[0]= field_poc[0];
4493     if(s->picture_structure != PICT_TOP_FIELD)
4494         s->current_picture_ptr->field_poc[1]= field_poc[1];
4495     if(s->picture_structure == PICT_FRAME) // FIXME field pix?
4496         s->current_picture_ptr->poc= FFMIN(field_poc[0], field_poc[1]);
4497
4498     return 0;
4499 }
4500
4501 /**
4502  * decodes a slice header.
4503  * this will allso call MPV_common_init() and frame_start() as needed
4504  */
4505 static int decode_slice_header(H264Context *h){
4506     MpegEncContext * const s = &h->s;
4507     unsigned int first_mb_in_slice;
4508     unsigned int pps_id;
4509     int num_ref_idx_active_override_flag;
4510     static const uint8_t slice_type_map[5]= {P_TYPE, B_TYPE, I_TYPE, SP_TYPE, SI_TYPE};
4511     unsigned int slice_type, tmp;
4512     int default_ref_list_done = 0;
4513
4514     s->current_picture.reference= h->nal_ref_idc != 0;
4515     s->dropable= h->nal_ref_idc == 0;
4516
4517     first_mb_in_slice= get_ue_golomb(&s->gb);
4518
4519     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
4520         h->slice_num = 0;
4521         s->current_picture_ptr= NULL;
4522     }
4523
4524     slice_type= get_ue_golomb(&s->gb);
4525     if(slice_type > 9){
4526         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
4527         return -1;
4528     }
4529     if(slice_type > 4){
4530         slice_type -= 5;
4531         h->slice_type_fixed=1;
4532     }else
4533         h->slice_type_fixed=0;
4534
4535     slice_type= slice_type_map[ slice_type ];
4536     if (slice_type == I_TYPE
4537         || (h->slice_num != 0 && slice_type == h->slice_type) ) {
4538         default_ref_list_done = 1;
4539     }
4540     h->slice_type= slice_type;
4541
4542     s->pict_type= h->slice_type; // to make a few old func happy, it's wrong though
4543
4544     pps_id= get_ue_golomb(&s->gb);
4545     if(pps_id>=MAX_PPS_COUNT){
4546         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
4547         return -1;
4548     }
4549     h->pps= h->pps_buffer[pps_id];
4550     if(h->pps.slice_group_count == 0){
4551         av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n");
4552         return -1;
4553     }
4554
4555     h->sps= h->sps_buffer[ h->pps.sps_id ];
4556     if(h->sps.log2_max_frame_num == 0){
4557         av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n");
4558         return -1;
4559     }
4560
4561     if(h->dequant_coeff_pps != pps_id){
4562         h->dequant_coeff_pps = pps_id;
4563         init_dequant_tables(h);
4564     }
4565
4566     s->mb_width= h->sps.mb_width;
4567     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
4568
4569     h->b_stride=  s->mb_width*4;
4570     h->b8_stride= s->mb_width*2;
4571
4572     s->width = 16*s->mb_width - 2*(h->sps.crop_left + h->sps.crop_right );
4573     if(h->sps.frame_mbs_only_flag)
4574         s->height= 16*s->mb_height - 2*(h->sps.crop_top  + h->sps.crop_bottom);
4575     else
4576         s->height= 16*s->mb_height - 4*(h->sps.crop_top  + h->sps.crop_bottom); //FIXME recheck
4577
4578     if (s->context_initialized
4579         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
4580         free_tables(h);
4581         MPV_common_end(s);
4582     }
4583     if (!s->context_initialized) {
4584         if (MPV_common_init(s) < 0)
4585             return -1;
4586
4587         if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
4588             memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
4589             memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
4590         }else{
4591             int i;
4592             for(i=0; i<16; i++){
4593 #define T(x) (x>>2) | ((x<<2) & 0xF)
4594                 h->zigzag_scan[i] = T(zigzag_scan[i]);
4595                 h-> field_scan[i] = T( field_scan[i]);
4596 #undef T
4597             }
4598         }
4599         if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
4600             memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
4601             memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
4602             memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
4603             memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
4604         }else{
4605             int i;
4606             for(i=0; i<64; i++){
4607 #define T(x) (x>>3) | ((x&7)<<3)
4608                 h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
4609                 h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
4610                 h->field_scan8x8[i]        = T(field_scan8x8[i]);
4611                 h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
4612 #undef T
4613             }
4614         }
4615         if(h->sps.transform_bypass){ //FIXME same ugly
4616             h->zigzag_scan_q0          = zigzag_scan;
4617             h->zigzag_scan8x8_q0       = zigzag_scan8x8;
4618             h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
4619             h->field_scan_q0           = field_scan;
4620             h->field_scan8x8_q0        = field_scan8x8;
4621             h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
4622         }else{
4623             h->zigzag_scan_q0          = h->zigzag_scan;
4624             h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
4625             h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
4626             h->field_scan_q0           = h->field_scan;
4627             h->field_scan8x8_q0        = h->field_scan8x8;
4628             h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
4629         }
4630
4631         alloc_tables(h);
4632
4633         s->avctx->width = s->width;
4634         s->avctx->height = s->height;
4635         s->avctx->sample_aspect_ratio= h->sps.sar;
4636         if(!s->avctx->sample_aspect_ratio.den)
4637             s->avctx->sample_aspect_ratio.den = 1;
4638
4639         if(h->sps.timing_info_present_flag){
4640             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
4641             if(h->x264_build > 0 && h->x264_build < 44)
4642                 s->avctx->time_base.den *= 2;
4643             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
4644                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
4645         }
4646     }
4647
4648     if(h->slice_num == 0){
4649         if(frame_start(h) < 0)
4650             return -1;
4651     }
4652
4653     s->current_picture_ptr->frame_num= //FIXME frame_num cleanup
4654     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
4655
4656     h->mb_mbaff = 0;
4657     h->mb_aff_frame = 0;
4658     if(h->sps.frame_mbs_only_flag){
4659         s->picture_structure= PICT_FRAME;
4660     }else{
4661         if(get_bits1(&s->gb)) { //field_pic_flag
4662             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
4663             av_log(h->s.avctx, AV_LOG_ERROR, "PAFF interlacing is not implemented\n");
4664         } else {
4665             s->picture_structure= PICT_FRAME;
4666             h->mb_aff_frame = h->sps.mb_aff;
4667         }
4668     }
4669     assert(s->mb_num == s->mb_width * s->mb_height);
4670     if(first_mb_in_slice << h->mb_aff_frame >= s->mb_num ||
4671        first_mb_in_slice                    >= s->mb_num){
4672         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
4673         return -1;
4674     }
4675     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
4676     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << h->mb_aff_frame;
4677     assert(s->mb_y < s->mb_height);
4678
4679     if(s->picture_structure==PICT_FRAME){
4680         h->curr_pic_num=   h->frame_num;
4681         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
4682     }else{
4683         h->curr_pic_num= 2*h->frame_num;
4684         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
4685     }
4686
4687     if(h->nal_unit_type == NAL_IDR_SLICE){
4688         get_ue_golomb(&s->gb); /* idr_pic_id */
4689     }
4690
4691     if(h->sps.poc_type==0){
4692         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
4693
4694         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
4695             h->delta_poc_bottom= get_se_golomb(&s->gb);
4696         }
4697     }
4698
4699     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
4700         h->delta_poc[0]= get_se_golomb(&s->gb);
4701
4702         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
4703             h->delta_poc[1]= get_se_golomb(&s->gb);
4704     }
4705
4706     init_poc(h);
4707
4708     if(h->pps.redundant_pic_cnt_present){
4709         h->redundant_pic_count= get_ue_golomb(&s->gb);
4710     }
4711
4712     //set defaults, might be overriden a few line later
4713     h->ref_count[0]= h->pps.ref_count[0];
4714     h->ref_count[1]= h->pps.ref_count[1];
4715
4716     if(h->slice_type == P_TYPE || h->slice_type == SP_TYPE || h->slice_type == B_TYPE){
4717         if(h->slice_type == B_TYPE){
4718             h->direct_spatial_mv_pred= get_bits1(&s->gb);
4719             if(h->sps.mb_aff && h->direct_spatial_mv_pred)
4720                 av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + spatial direct mode is not implemented\n");
4721         }
4722         num_ref_idx_active_override_flag= get_bits1(&s->gb);
4723
4724         if(num_ref_idx_active_override_flag){
4725             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
4726             if(h->slice_type==B_TYPE)
4727                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
4728
4729             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
4730                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
4731                 h->ref_count[0]= h->ref_count[1]= 1;
4732                 return -1;
4733             }
4734         }
4735         if(h->slice_type == B_TYPE)
4736             h->list_count= 2;
4737         else
4738             h->list_count= 1;
4739     }else
4740         h->list_count= 0;
4741
4742     if(!default_ref_list_done){
4743         fill_default_ref_list(h);
4744     }
4745
4746     if(decode_ref_pic_list_reordering(h) < 0)
4747         return -1;
4748
4749     if(   (h->pps.weighted_pred          && (h->slice_type == P_TYPE || h->slice_type == SP_TYPE ))
4750        || (h->pps.weighted_bipred_idc==1 && h->slice_type==B_TYPE ) )
4751         pred_weight_table(h);
4752     else if(h->pps.weighted_bipred_idc==2 && h->slice_type==B_TYPE)
4753         implicit_weight_table(h);
4754     else
4755         h->use_weight = 0;
4756
4757     if(s->current_picture.reference)
4758         decode_ref_pic_marking(h);
4759
4760     if(FRAME_MBAFF)
4761         fill_mbaff_ref_list(h);
4762
4763     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE && h->pps.cabac ){
4764         tmp = get_ue_golomb(&s->gb);
4765         if(tmp > 2){
4766             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
4767             return -1;
4768         }
4769         h->cabac_init_idc= tmp;
4770     }
4771
4772     h->last_qscale_diff = 0;
4773     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
4774     if(tmp>51){
4775         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
4776         return -1;
4777     }
4778     s->qscale= tmp;
4779     h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
4780     //FIXME qscale / qp ... stuff
4781     if(h->slice_type == SP_TYPE){
4782         get_bits1(&s->gb); /* sp_for_switch_flag */
4783     }
4784     if(h->slice_type==SP_TYPE || h->slice_type == SI_TYPE){
4785         get_se_golomb(&s->gb); /* slice_qs_delta */
4786     }
4787
4788     h->deblocking_filter = 1;
4789     h->slice_alpha_c0_offset = 0;
4790     h->slice_beta_offset = 0;
4791     if( h->pps.deblocking_filter_parameters_present ) {
4792         tmp= get_ue_golomb(&s->gb);
4793         if(tmp > 2){
4794             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4795             return -1;
4796         }
4797         h->deblocking_filter= tmp;
4798         if(h->deblocking_filter < 2)
4799             h->deblocking_filter^= 1; // 1<->0
4800
4801         if( h->deblocking_filter ) {
4802             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4803             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4804         }
4805     }
4806     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4807        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type != I_TYPE)
4808        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type == B_TYPE)
4809        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4810         h->deblocking_filter= 0;
4811
4812 #if 0 //FMO
4813     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4814         slice_group_change_cycle= get_bits(&s->gb, ?);
4815 #endif
4816
4817     h->slice_num++;
4818
4819     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4820     h->emu_edge_height= FRAME_MBAFF ? 0 : h->emu_edge_width;
4821
4822     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4823         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s\n",
4824                h->slice_num,
4825                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4826                first_mb_in_slice,
4827                av_get_pict_type_char(h->slice_type),
4828                pps_id, h->frame_num,
4829                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4830                h->ref_count[0], h->ref_count[1],
4831                s->qscale,
4832                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4833                h->use_weight,
4834                h->use_weight==1 && h->use_weight_chroma ? "c" : ""
4835                );
4836     }
4837
4838     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !s->current_picture.reference){
4839         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
4840         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
4841     }else{
4842         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
4843         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
4844     }
4845
4846     return 0;
4847 }
4848
4849 /**
4850  *
4851  */
4852 static inline int get_level_prefix(GetBitContext *gb){
4853     unsigned int buf;
4854     int log;
4855
4856     OPEN_READER(re, gb);
4857     UPDATE_CACHE(re, gb);
4858     buf=GET_CACHE(re, gb);
4859
4860     log= 32 - av_log2(buf);
4861 #ifdef TRACE
4862     print_bin(buf>>(32-log), log);
4863     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4864 #endif
4865
4866     LAST_SKIP_BITS(re, gb, log);
4867     CLOSE_READER(re, gb);
4868
4869     return log-1;
4870 }
4871
4872 static inline int get_dct8x8_allowed(H264Context *h){
4873     int i;
4874     for(i=0; i<4; i++){
4875         if(!IS_SUB_8X8(h->sub_mb_type[i])
4876            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4877             return 0;
4878     }
4879     return 1;
4880 }
4881
4882 /**
4883  * decodes a residual block.
4884  * @param n block index
4885  * @param scantable scantable
4886  * @param max_coeff number of coefficients in the block
4887  * @return <0 if an error occured
4888  */
4889 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4890     MpegEncContext * const s = &h->s;
4891     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4892     int level[16];
4893     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4894
4895     //FIXME put trailing_onex into the context
4896
4897     if(n == CHROMA_DC_BLOCK_INDEX){
4898         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4899         total_coeff= coeff_token>>2;
4900     }else{
4901         if(n == LUMA_DC_BLOCK_INDEX){
4902             total_coeff= pred_non_zero_count(h, 0);
4903             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4904             total_coeff= coeff_token>>2;
4905         }else{
4906             total_coeff= pred_non_zero_count(h, n);
4907             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4908             total_coeff= coeff_token>>2;
4909             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4910         }
4911     }
4912
4913     //FIXME set last_non_zero?
4914
4915     if(total_coeff==0)
4916         return 0;
4917     if(total_coeff > (unsigned)max_coeff) {
4918         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4919         return -1;
4920     }
4921
4922     trailing_ones= coeff_token&3;
4923     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4924     assert(total_coeff<=16);
4925
4926     for(i=0; i<trailing_ones; i++){
4927         level[i]= 1 - 2*get_bits1(gb);
4928     }
4929
4930     if(i<total_coeff) {
4931         int level_code, mask;
4932         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4933         int prefix= get_level_prefix(gb);
4934
4935         //first coefficient has suffix_length equal to 0 or 1
4936         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4937             if(suffix_length)
4938                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4939             else
4940                 level_code= (prefix<<suffix_length); //part
4941         }else if(prefix==14){
4942             if(suffix_length)
4943                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4944             else
4945                 level_code= prefix + get_bits(gb, 4); //part
4946         }else if(prefix==15){
4947             level_code= (prefix<<suffix_length) + get_bits(gb, 12); //part
4948             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4949         }else{
4950             av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4951             return -1;
4952         }
4953
4954         if(trailing_ones < 3) level_code += 2;
4955
4956         suffix_length = 1;
4957         if(level_code > 5)
4958             suffix_length++;
4959         mask= -(level_code&1);
4960         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4961         i++;
4962
4963         //remaining coefficients have suffix_length > 0
4964         for(;i<total_coeff;i++) {
4965             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4966             prefix = get_level_prefix(gb);
4967             if(prefix<15){
4968                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4969             }else if(prefix==15){
4970                 level_code =  (prefix<<suffix_length) + get_bits(gb, 12);
4971             }else{
4972                 av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4973                 return -1;
4974             }
4975             mask= -(level_code&1);
4976             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4977             if(level_code > suffix_limit[suffix_length])
4978                 suffix_length++;
4979         }
4980     }
4981
4982     if(total_coeff == max_coeff)
4983         zeros_left=0;
4984     else{
4985         if(n == CHROMA_DC_BLOCK_INDEX)
4986             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4987         else
4988             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4989     }
4990
4991     coeff_num = zeros_left + total_coeff - 1;
4992     j = scantable[coeff_num];
4993     if(n > 24){
4994         block[j] = level[0];
4995         for(i=1;i<total_coeff;i++) {
4996             if(zeros_left <= 0)
4997                 run_before = 0;
4998             else if(zeros_left < 7){
4999                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
5000             }else{
5001                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
5002             }
5003             zeros_left -= run_before;
5004             coeff_num -= 1 + run_before;
5005             j= scantable[ coeff_num ];
5006
5007             block[j]= level[i];
5008         }
5009     }else{
5010         block[j] = (level[0] * qmul[j] + 32)>>6;
5011         for(i=1;i<total_coeff;i++) {
5012             if(zeros_left <= 0)
5013                 run_before = 0;
5014             else if(zeros_left < 7){
5015                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
5016             }else{
5017                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
5018             }
5019             zeros_left -= run_before;
5020             coeff_num -= 1 + run_before;
5021             j= scantable[ coeff_num ];
5022
5023             block[j]= (level[i] * qmul[j] + 32)>>6;
5024         }
5025     }
5026
5027     if(zeros_left<0){
5028         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
5029         return -1;
5030     }
5031
5032     return 0;
5033 }
5034
5035 static void predict_field_decoding_flag(H264Context *h){
5036     MpegEncContext * const s = &h->s;
5037     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5038     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
5039                 ? s->current_picture.mb_type[mb_xy-1]
5040                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
5041                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
5042                 : 0;
5043     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
5044 }
5045
5046 /**
5047  * decodes a P_SKIP or B_SKIP macroblock
5048  */
5049 static void decode_mb_skip(H264Context *h){
5050     MpegEncContext * const s = &h->s;
5051     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5052     int mb_type=0;
5053
5054     memset(h->non_zero_count[mb_xy], 0, 16);
5055     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
5056
5057     if(MB_FIELD)
5058         mb_type|= MB_TYPE_INTERLACED;
5059
5060     if( h->slice_type == B_TYPE )
5061     {
5062         // just for fill_caches. pred_direct_motion will set the real mb_type
5063         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
5064
5065         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
5066         pred_direct_motion(h, &mb_type);
5067         mb_type|= MB_TYPE_SKIP;
5068     }
5069     else
5070     {
5071         int mx, my;
5072         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
5073
5074         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
5075         pred_pskip_motion(h, &mx, &my);
5076         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
5077         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
5078     }
5079
5080     write_back_motion(h, mb_type);
5081     s->current_picture.mb_type[mb_xy]= mb_type;
5082     s->current_picture.qscale_table[mb_xy]= s->qscale;
5083     h->slice_table[ mb_xy ]= h->slice_num;
5084     h->prev_mb_skipped= 1;
5085 }
5086
5087 /**
5088  * decodes a macroblock
5089  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5090  */
5091 static int decode_mb_cavlc(H264Context *h){
5092     MpegEncContext * const s = &h->s;
5093     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5094     int partition_count;
5095     unsigned int mb_type, cbp;
5096     int dct8x8_allowed= h->pps.transform_8x8_mode;
5097
5098     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
5099
5100     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5101     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
5102                 down the code */
5103     if(h->slice_type != I_TYPE && h->slice_type != SI_TYPE){
5104         if(s->mb_skip_run==-1)
5105             s->mb_skip_run= get_ue_golomb(&s->gb);
5106
5107         if (s->mb_skip_run--) {
5108             if(FRAME_MBAFF && (s->mb_y&1) == 0){
5109                 if(s->mb_skip_run==0)
5110                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
5111                 else
5112                     predict_field_decoding_flag(h);
5113             }
5114             decode_mb_skip(h);
5115             return 0;
5116         }
5117     }
5118     if(FRAME_MBAFF){
5119         if( (s->mb_y&1) == 0 )
5120             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
5121     }else
5122         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5123
5124     h->prev_mb_skipped= 0;
5125
5126     mb_type= get_ue_golomb(&s->gb);
5127     if(h->slice_type == B_TYPE){
5128         if(mb_type < 23){
5129             partition_count= b_mb_type_info[mb_type].partition_count;
5130             mb_type=         b_mb_type_info[mb_type].type;
5131         }else{
5132             mb_type -= 23;
5133             goto decode_intra_mb;
5134         }
5135     }else if(h->slice_type == P_TYPE /*|| h->slice_type == SP_TYPE */){
5136         if(mb_type < 5){
5137             partition_count= p_mb_type_info[mb_type].partition_count;
5138             mb_type=         p_mb_type_info[mb_type].type;
5139         }else{
5140             mb_type -= 5;
5141             goto decode_intra_mb;
5142         }
5143     }else{
5144        assert(h->slice_type == I_TYPE);
5145 decode_intra_mb:
5146         if(mb_type > 25){
5147             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
5148             return -1;
5149         }
5150         partition_count=0;
5151         cbp= i_mb_type_info[mb_type].cbp;
5152         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5153         mb_type= i_mb_type_info[mb_type].type;
5154     }
5155
5156     if(MB_FIELD)
5157         mb_type |= MB_TYPE_INTERLACED;
5158
5159     h->slice_table[ mb_xy ]= h->slice_num;
5160
5161     if(IS_INTRA_PCM(mb_type)){
5162         unsigned int x, y;
5163
5164         // we assume these blocks are very rare so we dont optimize it
5165         align_get_bits(&s->gb);
5166
5167         // The pixels are stored in the same order as levels in h->mb array.
5168         for(y=0; y<16; y++){
5169             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
5170             for(x=0; x<16; x++){
5171                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
5172                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
5173             }
5174         }
5175         for(y=0; y<8; y++){
5176             const int index= 256 + 4*(y&3) + 32*(y>>2);
5177             for(x=0; x<8; x++){
5178                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
5179                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
5180             }
5181         }
5182         for(y=0; y<8; y++){
5183             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
5184             for(x=0; x<8; x++){
5185                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
5186                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
5187             }
5188         }
5189
5190         // In deblocking, the quantizer is 0
5191         s->current_picture.qscale_table[mb_xy]= 0;
5192         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0);
5193         // All coeffs are present
5194         memset(h->non_zero_count[mb_xy], 16, 16);
5195
5196         s->current_picture.mb_type[mb_xy]= mb_type;
5197         return 0;
5198     }
5199
5200     if(MB_MBAFF){
5201         h->ref_count[0] <<= 1;
5202         h->ref_count[1] <<= 1;
5203     }
5204
5205     fill_caches(h, mb_type, 0);
5206
5207     //mb_pred
5208     if(IS_INTRA(mb_type)){
5209             int pred_mode;
5210 //            init_top_left_availability(h);
5211             if(IS_INTRA4x4(mb_type)){
5212                 int i;
5213                 int di = 1;
5214                 if(dct8x8_allowed && get_bits1(&s->gb)){
5215                     mb_type |= MB_TYPE_8x8DCT;
5216                     di = 4;
5217                 }
5218
5219 //                fill_intra4x4_pred_table(h);
5220                 for(i=0; i<16; i+=di){
5221                     int mode= pred_intra_mode(h, i);
5222
5223                     if(!get_bits1(&s->gb)){
5224                         const int rem_mode= get_bits(&s->gb, 3);
5225                         mode = rem_mode + (rem_mode >= mode);
5226                     }
5227
5228                     if(di==4)
5229                         fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5230                     else
5231                         h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
5232                 }
5233                 write_back_intra_pred_mode(h);
5234                 if( check_intra4x4_pred_mode(h) < 0)
5235                     return -1;
5236             }else{
5237                 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
5238                 if(h->intra16x16_pred_mode < 0)
5239                     return -1;
5240             }
5241
5242             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
5243             if(pred_mode < 0)
5244                 return -1;
5245             h->chroma_pred_mode= pred_mode;
5246     }else if(partition_count==4){
5247         int i, j, sub_partition_count[4], list, ref[2][4];
5248
5249         if(h->slice_type == B_TYPE){
5250             for(i=0; i<4; i++){
5251                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
5252                 if(h->sub_mb_type[i] >=13){
5253                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
5254                     return -1;
5255                 }
5256                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5257                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5258             }
5259             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
5260                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
5261                 pred_direct_motion(h, &mb_type);
5262                 h->ref_cache[0][scan8[4]] =
5263                 h->ref_cache[1][scan8[4]] =
5264                 h->ref_cache[0][scan8[12]] =
5265                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5266             }
5267         }else{
5268             assert(h->slice_type == P_TYPE || h->slice_type == SP_TYPE); //FIXME SP correct ?
5269             for(i=0; i<4; i++){
5270                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
5271                 if(h->sub_mb_type[i] >=4){
5272                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
5273                     return -1;
5274                 }
5275                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5276                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5277             }
5278         }
5279
5280         for(list=0; list<h->list_count; list++){
5281             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
5282             for(i=0; i<4; i++){
5283                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
5284                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
5285                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
5286                     if(tmp>=ref_count){
5287                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
5288                         return -1;
5289                     }
5290                     ref[list][i]= tmp;
5291                 }else{
5292                  //FIXME
5293                     ref[list][i] = -1;
5294                 }
5295             }
5296         }
5297
5298         if(dct8x8_allowed)
5299             dct8x8_allowed = get_dct8x8_allowed(h);
5300
5301         for(list=0; list<h->list_count; list++){
5302             for(i=0; i<4; i++){
5303                 if(IS_DIRECT(h->sub_mb_type[i])) {
5304                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
5305                     continue;
5306                 }
5307                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
5308                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5309
5310                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
5311                     const int sub_mb_type= h->sub_mb_type[i];
5312                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5313                     for(j=0; j<sub_partition_count[i]; j++){
5314                         int mx, my;
5315                         const int index= 4*i + block_width*j;
5316                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5317                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
5318                         mx += get_se_golomb(&s->gb);
5319                         my += get_se_golomb(&s->gb);
5320                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5321
5322                         if(IS_SUB_8X8(sub_mb_type)){
5323                             mv_cache[ 1 ][0]=
5324                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5325                             mv_cache[ 1 ][1]=
5326                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5327                         }else if(IS_SUB_8X4(sub_mb_type)){
5328                             mv_cache[ 1 ][0]= mx;
5329                             mv_cache[ 1 ][1]= my;
5330                         }else if(IS_SUB_4X8(sub_mb_type)){
5331                             mv_cache[ 8 ][0]= mx;
5332                             mv_cache[ 8 ][1]= my;
5333                         }
5334                         mv_cache[ 0 ][0]= mx;
5335                         mv_cache[ 0 ][1]= my;
5336                     }
5337                 }else{
5338                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5339                     p[0] = p[1]=
5340                     p[8] = p[9]= 0;
5341                 }
5342             }
5343         }
5344     }else if(IS_DIRECT(mb_type)){
5345         pred_direct_motion(h, &mb_type);
5346         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5347     }else{
5348         int list, mx, my, i;
5349          //FIXME we should set ref_idx_l? to 0 if we use that later ...
5350         if(IS_16X16(mb_type)){
5351             for(list=0; list<h->list_count; list++){
5352                     unsigned int val;
5353                     if(IS_DIR(mb_type, 0, list)){
5354                         val= get_te0_golomb(&s->gb, h->ref_count[list]);
5355                         if(val >= h->ref_count[list]){
5356                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
5357                             return -1;
5358                         }
5359                     }else
5360                         val= LIST_NOT_USED&0xFF;
5361                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
5362             }
5363             for(list=0; list<h->list_count; list++){
5364                 unsigned int val;
5365                 if(IS_DIR(mb_type, 0, list)){
5366                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
5367                     mx += get_se_golomb(&s->gb);
5368                     my += get_se_golomb(&s->gb);
5369                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5370
5371                     val= pack16to32(mx,my);
5372                 }else
5373                     val=0;
5374                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
5375             }
5376         }
5377         else if(IS_16X8(mb_type)){
5378             for(list=0; list<h->list_count; list++){
5379                     for(i=0; i<2; i++){
5380                         unsigned int val;
5381                         if(IS_DIR(mb_type, i, list)){
5382                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
5383                             if(val >= h->ref_count[list]){
5384                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
5385                                 return -1;
5386                             }
5387                         }else
5388                             val= LIST_NOT_USED&0xFF;
5389                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
5390                     }
5391             }
5392             for(list=0; list<h->list_count; list++){
5393                 for(i=0; i<2; i++){
5394                     unsigned int val;
5395                     if(IS_DIR(mb_type, i, list)){
5396                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
5397                         mx += get_se_golomb(&s->gb);
5398                         my += get_se_golomb(&s->gb);
5399                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5400
5401                         val= pack16to32(mx,my);
5402                     }else
5403                         val=0;
5404                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
5405                 }
5406             }
5407         }else{
5408             assert(IS_8X16(mb_type));
5409             for(list=0; list<h->list_count; list++){
5410                     for(i=0; i<2; i++){
5411                         unsigned int val;
5412                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5413                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
5414                             if(val >= h->ref_count[list]){
5415                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
5416                                 return -1;
5417                             }
5418                         }else
5419                             val= LIST_NOT_USED&0xFF;
5420                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
5421                     }
5422             }
5423             for(list=0; list<h->list_count; list++){
5424                 for(i=0; i<2; i++){
5425                     unsigned int val;
5426                     if(IS_DIR(mb_type, i, list)){
5427                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
5428                         mx += get_se_golomb(&s->gb);
5429                         my += get_se_golomb(&s->gb);
5430                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5431
5432                         val= pack16to32(mx,my);
5433                     }else
5434                         val=0;
5435                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
5436                 }
5437             }
5438         }
5439     }
5440
5441     if(IS_INTER(mb_type))
5442         write_back_motion(h, mb_type);
5443
5444     if(!IS_INTRA16x16(mb_type)){
5445         cbp= get_ue_golomb(&s->gb);
5446         if(cbp > 47){
5447             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
5448             return -1;
5449         }
5450
5451         if(IS_INTRA4x4(mb_type))
5452             cbp= golomb_to_intra4x4_cbp[cbp];
5453         else
5454             cbp= golomb_to_inter_cbp[cbp];
5455     }
5456     h->cbp = cbp;
5457
5458     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
5459         if(get_bits1(&s->gb))
5460             mb_type |= MB_TYPE_8x8DCT;
5461     }
5462     s->current_picture.mb_type[mb_xy]= mb_type;
5463
5464     if(cbp || IS_INTRA16x16(mb_type)){
5465         int i8x8, i4x4, chroma_idx;
5466         int chroma_qp, dquant;
5467         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
5468         const uint8_t *scan, *scan8x8, *dc_scan;
5469
5470 //        fill_non_zero_count_cache(h);
5471
5472         if(IS_INTERLACED(mb_type)){
5473             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
5474             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5475             dc_scan= luma_dc_field_scan;
5476         }else{
5477             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
5478             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5479             dc_scan= luma_dc_zigzag_scan;
5480         }
5481
5482         dquant= get_se_golomb(&s->gb);
5483
5484         if( dquant > 25 || dquant < -26 ){
5485             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
5486             return -1;
5487         }
5488
5489         s->qscale += dquant;
5490         if(((unsigned)s->qscale) > 51){
5491             if(s->qscale<0) s->qscale+= 52;
5492             else            s->qscale-= 52;
5493         }
5494
5495         h->chroma_qp= chroma_qp= get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
5496         if(IS_INTRA16x16(mb_type)){
5497             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
5498                 return -1; //FIXME continue if partitioned and other return -1 too
5499             }
5500
5501             assert((cbp&15) == 0 || (cbp&15) == 15);
5502
5503             if(cbp&15){
5504                 for(i8x8=0; i8x8<4; i8x8++){
5505                     for(i4x4=0; i4x4<4; i4x4++){
5506                         const int index= i4x4 + 4*i8x8;
5507                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
5508                             return -1;
5509                         }
5510                     }
5511                 }
5512             }else{
5513                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5514             }
5515         }else{
5516             for(i8x8=0; i8x8<4; i8x8++){
5517                 if(cbp & (1<<i8x8)){
5518                     if(IS_8x8DCT(mb_type)){
5519                         DCTELEM *buf = &h->mb[64*i8x8];
5520                         uint8_t *nnz;
5521                         for(i4x4=0; i4x4<4; i4x4++){
5522                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
5523                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
5524                                 return -1;
5525                         }
5526                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5527                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
5528                     }else{
5529                         for(i4x4=0; i4x4<4; i4x4++){
5530                             const int index= i4x4 + 4*i8x8;
5531
5532                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
5533                                 return -1;
5534                             }
5535                         }
5536                     }
5537                 }else{
5538                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5539                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5540                 }
5541             }
5542         }
5543
5544         if(cbp&0x30){
5545             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
5546                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
5547                     return -1;
5548                 }
5549         }
5550
5551         if(cbp&0x20){
5552             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
5553                 for(i4x4=0; i4x4<4; i4x4++){
5554                     const int index= 16 + 4*chroma_idx + i4x4;
5555                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][chroma_qp], 15) < 0){
5556                         return -1;
5557                     }
5558                 }
5559             }
5560         }else{
5561             uint8_t * const nnz= &h->non_zero_count_cache[0];
5562             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5563             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5564         }
5565     }else{
5566         uint8_t * const nnz= &h->non_zero_count_cache[0];
5567         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5568         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5569         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5570     }
5571     s->current_picture.qscale_table[mb_xy]= s->qscale;
5572     write_back_non_zero_count(h);
5573
5574     if(MB_MBAFF){
5575         h->ref_count[0] >>= 1;
5576         h->ref_count[1] >>= 1;
5577     }
5578
5579     return 0;
5580 }
5581
5582 static int decode_cabac_field_decoding_flag(H264Context *h) {
5583     MpegEncContext * const s = &h->s;
5584     const int mb_x = s->mb_x;
5585     const int mb_y = s->mb_y & ~1;
5586     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
5587     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
5588
5589     unsigned int ctx = 0;
5590
5591     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
5592         ctx += 1;
5593     }
5594     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
5595         ctx += 1;
5596     }
5597
5598     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
5599 }
5600
5601 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
5602     uint8_t *state= &h->cabac_state[ctx_base];
5603     int mb_type;
5604
5605     if(intra_slice){
5606         MpegEncContext * const s = &h->s;
5607         const int mba_xy = h->left_mb_xy[0];
5608         const int mbb_xy = h->top_mb_xy;
5609         int ctx=0;
5610         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
5611             ctx++;
5612         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
5613             ctx++;
5614         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
5615             return 0;   /* I4x4 */
5616         state += 2;
5617     }else{
5618         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
5619             return 0;   /* I4x4 */
5620     }
5621
5622     if( get_cabac_terminate( &h->cabac ) )
5623         return 25;  /* PCM */
5624
5625     mb_type = 1; /* I16x16 */
5626     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
5627     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
5628         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
5629     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
5630     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
5631     return mb_type;
5632 }
5633
5634 static int decode_cabac_mb_type( H264Context *h ) {
5635     MpegEncContext * const s = &h->s;
5636
5637     if( h->slice_type == I_TYPE ) {
5638         return decode_cabac_intra_mb_type(h, 3, 1);
5639     } else if( h->slice_type == P_TYPE ) {
5640         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5641             /* P-type */
5642             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5643                 /* P_L0_D16x16, P_8x8 */
5644                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5645             } else {
5646                 /* P_L0_D8x16, P_L0_D16x8 */
5647                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5648             }
5649         } else {
5650             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
5651         }
5652     } else if( h->slice_type == B_TYPE ) {
5653         const int mba_xy = h->left_mb_xy[0];
5654         const int mbb_xy = h->top_mb_xy;
5655         int ctx = 0;
5656         int bits;
5657
5658         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
5659             ctx++;
5660         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
5661             ctx++;
5662
5663         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
5664             return 0; /* B_Direct_16x16 */
5665
5666         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
5667             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
5668         }
5669
5670         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
5671         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
5672         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
5673         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5674         if( bits < 8 )
5675             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
5676         else if( bits == 13 ) {
5677             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
5678         } else if( bits == 14 )
5679             return 11; /* B_L1_L0_8x16 */
5680         else if( bits == 15 )
5681             return 22; /* B_8x8 */
5682
5683         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5684         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5685     } else {
5686         /* TODO SI/SP frames? */
5687         return -1;
5688     }
5689 }
5690
5691 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
5692     MpegEncContext * const s = &h->s;
5693     int mba_xy, mbb_xy;
5694     int ctx = 0;
5695
5696     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
5697         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
5698         mba_xy = mb_xy - 1;
5699         if( (mb_y&1)
5700             && h->slice_table[mba_xy] == h->slice_num
5701             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
5702             mba_xy += s->mb_stride;
5703         if( MB_FIELD ){
5704             mbb_xy = mb_xy - s->mb_stride;
5705             if( !(mb_y&1)
5706                 && h->slice_table[mbb_xy] == h->slice_num
5707                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
5708                 mbb_xy -= s->mb_stride;
5709         }else
5710             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
5711     }else{
5712         int mb_xy = mb_x + mb_y*s->mb_stride;
5713         mba_xy = mb_xy - 1;
5714         mbb_xy = mb_xy - s->mb_stride;
5715     }
5716
5717     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5718         ctx++;
5719     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5720         ctx++;
5721
5722     if( h->slice_type == B_TYPE )
5723         ctx += 13;
5724     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
5725 }
5726
5727 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5728     int mode = 0;
5729
5730     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5731         return pred_mode;
5732
5733     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5734     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5735     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5736
5737     if( mode >= pred_mode )
5738         return mode + 1;
5739     else
5740         return mode;
5741 }
5742
5743 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5744     const int mba_xy = h->left_mb_xy[0];
5745     const int mbb_xy = h->top_mb_xy;
5746
5747     int ctx = 0;
5748
5749     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5750     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5751         ctx++;
5752
5753     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5754         ctx++;
5755
5756     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5757         return 0;
5758
5759     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5760         return 1;
5761     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5762         return 2;
5763     else
5764         return 3;
5765 }
5766
/**
 * Column of each of the 16 block indices; together with block_idx_y this
 * places every index n at (block_idx_x[n], block_idx_y[n]) in a 4x4 grid.
 */
static const uint8_t block_idx_x[16] = {
    0, 1, 0, 1,
    2, 3, 2, 3,
    0, 1, 0, 1,
    2, 3, 2, 3,
};

/** Row of each of the 16 block indices, see block_idx_x. */
static const uint8_t block_idx_y[16] = {
    0, 0, 1, 1,
    0, 0, 1, 1,
    2, 2, 3, 3,
    2, 2, 3, 3,
};

/**
 * Inverse of the two tables above: block_idx_xy[x][y] is the index n with
 * block_idx_x[n] == x and block_idx_y[n] == y.
 */
static const uint8_t block_idx_xy[4][4] = {
    {  0,  2,  8, 10 },
    {  1,  3,  9, 11 },
    {  4,  6, 12, 14 },
    {  5,  7, 13, 15 },
};
5779
5780 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5781     int cbp = 0;
5782     int cbp_b = -1;
5783     int i8x8;
5784
5785     if( h->slice_table[h->top_mb_xy] == h->slice_num ) {
5786         cbp_b = h->top_cbp;
5787         tprintf(h->s.avctx, "cbp_b = top_cbp = %x\n", cbp_b);
5788     }
5789
5790     for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5791         int cbp_a = -1;
5792         int x, y;
5793         int ctx = 0;
5794
5795         x = block_idx_x[4*i8x8];
5796         y = block_idx_y[4*i8x8];
5797
5798         if( x > 0 )
5799             cbp_a = cbp;
5800         else if( h->slice_table[h->left_mb_xy[0]] == h->slice_num ) {
5801             cbp_a = h->left_cbp;
5802             tprintf(h->s.avctx, "cbp_a = left_cbp = %x\n", cbp_a);
5803         }
5804
5805         if( y > 0 )
5806             cbp_b = cbp;
5807
5808         /* No need to test for skip as we put 0 for skip block */
5809         /* No need to test for IPCM as we put 1 for IPCM block */
5810         if( cbp_a >= 0 ) {
5811             int i8x8a = block_idx_xy[(x-1)&0x03][y]/4;
5812             if( ((cbp_a >> i8x8a)&0x01) == 0 )
5813                 ctx++;
5814         }
5815
5816         if( cbp_b >= 0 ) {
5817             int i8x8b = block_idx_xy[x][(y-1)&0x03]/4;
5818             if( ((cbp_b >> i8x8b)&0x01) == 0 )
5819                 ctx += 2;
5820         }
5821
5822         if( get_cabac( &h->cabac, &h->cabac_state[73 + ctx] ) ) {
5823             cbp |= 1 << i8x8;
5824         }
5825     }
5826     return cbp;
5827 }
5828 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5829     int ctx;
5830     int cbp_a, cbp_b;
5831
5832     cbp_a = (h->left_cbp>>4)&0x03;
5833     cbp_b = (h-> top_cbp>>4)&0x03;
5834
5835     ctx = 0;
5836     if( cbp_a > 0 ) ctx++;
5837     if( cbp_b > 0 ) ctx += 2;
5838     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5839         return 0;
5840
5841     ctx = 4;
5842     if( cbp_a == 2 ) ctx++;
5843     if( cbp_b == 2 ) ctx += 2;
5844     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5845 }
5846 static int decode_cabac_mb_dqp( H264Context *h) {
5847     MpegEncContext * const s = &h->s;
5848     int mbn_xy;
5849     int   ctx = 0;
5850     int   val = 0;
5851
5852     if( s->mb_x > 0 )
5853         mbn_xy = s->mb_x + s->mb_y*s->mb_stride - 1;
5854     else
5855         mbn_xy = s->mb_width - 1 + (s->mb_y-1)*s->mb_stride;
5856
5857     if( h->last_qscale_diff != 0 )
5858         ctx++;
5859
5860     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5861         if( ctx < 2 )
5862             ctx = 2;
5863         else
5864             ctx = 3;
5865         val++;
5866         if(val > 102) //prevent infinite loop
5867             return INT_MIN;
5868     }
5869
5870     if( val&0x01 )
5871         return (val + 1)/2;
5872     else
5873         return -(val + 1)/2;
5874 }
5875 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5876     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5877         return 0;   /* 8x8 */
5878     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5879         return 1;   /* 8x4 */
5880     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5881         return 2;   /* 4x8 */
5882     return 3;       /* 4x4 */
5883 }
5884 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5885     int type;
5886     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5887         return 0;   /* B_Direct_8x8 */
5888     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5889         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5890     type = 3;
5891     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5892         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5893             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5894         type += 4;
5895     }
5896     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5897     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5898     return type;
5899 }
5900
5901 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5902     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5903 }
5904
5905 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5906     int refa = h->ref_cache[list][scan8[n] - 1];
5907     int refb = h->ref_cache[list][scan8[n] - 8];
5908     int ref  = 0;
5909     int ctx  = 0;
5910
5911     if( h->slice_type == B_TYPE) {
5912         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5913             ctx++;
5914         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5915             ctx += 2;
5916     } else {
5917         if( refa > 0 )
5918             ctx++;
5919         if( refb > 0 )
5920             ctx += 2;
5921     }
5922
5923     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5924         ref++;
5925         if( ctx < 4 )
5926             ctx = 4;
5927         else
5928             ctx = 5;
5929         if(ref >= 32 /*h->ref_list[list]*/){
5930             av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
5931             return 0; //FIXME we should return -1 and check the return everywhere
5932         }
5933     }
5934     return ref;
5935 }
5936
5937 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5938     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5939                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5940     int ctxbase = (l == 0) ? 40 : 47;
5941     int ctx, mvd;
5942
5943     if( amvd < 3 )
5944         ctx = 0;
5945     else if( amvd > 32 )
5946         ctx = 2;
5947     else
5948         ctx = 1;
5949
5950     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5951         return 0;
5952
5953     mvd= 1;
5954     ctx= 3;
5955     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5956         mvd++;
5957         if( ctx < 6 )
5958             ctx++;
5959     }
5960
5961     if( mvd >= 9 ) {
5962         int k = 3;
5963         while( get_cabac_bypass( &h->cabac ) ) {
5964             mvd += 1 << k;
5965             k++;
5966             if(k>24){
5967                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5968                 return INT_MIN;
5969             }
5970         }
5971         while( k-- ) {
5972             if( get_cabac_bypass( &h->cabac ) )
5973                 mvd += 1 << k;
5974         }
5975     }
5976     return get_cabac_bypass_sign( &h->cabac, -mvd );
5977 }
5978
5979 static inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
5980     int nza, nzb;
5981     int ctx = 0;
5982
5983     if( cat == 0 ) {
5984         nza = h->left_cbp&0x100;
5985         nzb = h-> top_cbp&0x100;
5986     } else if( cat == 1 || cat == 2 ) {
5987         nza = h->non_zero_count_cache[scan8[idx] - 1];
5988         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5989     } else if( cat == 3 ) {
5990         nza = (h->left_cbp>>(6+idx))&0x01;
5991         nzb = (h-> top_cbp>>(6+idx))&0x01;
5992     } else {
5993         assert(cat == 4);
5994         nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5995         nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5996     }
5997
5998     if( nza > 0 )
5999         ctx++;
6000
6001     if( nzb > 0 )
6002         ctx += 2;
6003
6004     return ctx + 4 * cat;
6005 }
6006
/**
 * Per-position context offsets for the "last coefficient" flag of 8x8 blocks.
 * decode_cabac_residual() indexes this table with the scan position (0..62)
 * and adds the value to its last-coefficient context base when cat == 5.
 * The "used" attribute keeps the table even when no C code references it;
 * NOTE(review): presumably the x86 significance decoder refers to it from
 * inline assembly -- confirm in cabac.h.
 */
static const __attribute((used)) uint8_t last_coeff_flag_offset_8x8[63] = {
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
    5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
};
6013
6014 static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff) {
6015     const int mb_xy  = h->s.mb_x + h->s.mb_y*h->s.mb_stride;
6016     static const int significant_coeff_flag_offset[2][6] = {
6017       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
6018       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
6019     };
6020     static const int last_coeff_flag_offset[2][6] = {
6021       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
6022       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
6023     };
6024     static const int coeff_abs_level_m1_offset[6] = {
6025         227+0, 227+10, 227+20, 227+30, 227+39, 426
6026     };
6027     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
6028       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
6029         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
6030         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
6031        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
6032       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
6033         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
6034         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
6035         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
6036     };
6037
6038     int index[64];
6039
6040     int last;
6041     int coeff_count = 0;
6042
6043     int abslevel1 = 1;
6044     int abslevelgt1 = 0;
6045
6046     uint8_t *significant_coeff_ctx_base;
6047     uint8_t *last_coeff_ctx_base;
6048     uint8_t *abs_level_m1_ctx_base;
6049
6050 #ifndef ARCH_X86
6051 #define CABAC_ON_STACK
6052 #endif
6053 #ifdef CABAC_ON_STACK
6054 #define CC &cc
6055     CABACContext cc;
6056     cc.range     = h->cabac.range;
6057     cc.low       = h->cabac.low;
6058     cc.bytestream= h->cabac.bytestream;
6059 #else
6060 #define CC &h->cabac
6061 #endif
6062
6063
6064     /* cat: 0-> DC 16x16  n = 0
6065      *      1-> AC 16x16  n = luma4x4idx
6066      *      2-> Luma4x4   n = luma4x4idx
6067      *      3-> DC Chroma n = iCbCr
6068      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
6069      *      5-> Luma8x8   n = 4 * luma8x8idx
6070      */
6071
6072     /* read coded block flag */
6073     if( cat != 5 ) {
6074         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n ) ] ) == 0 ) {
6075             if( cat == 1 || cat == 2 )
6076                 h->non_zero_count_cache[scan8[n]] = 0;
6077             else if( cat == 4 )
6078                 h->non_zero_count_cache[scan8[16+n]] = 0;
6079 #ifdef CABAC_ON_STACK
6080             h->cabac.range     = cc.range     ;
6081             h->cabac.low       = cc.low       ;
6082             h->cabac.bytestream= cc.bytestream;
6083 #endif
6084             return 0;
6085         }
6086     }
6087
6088     significant_coeff_ctx_base = h->cabac_state
6089         + significant_coeff_flag_offset[MB_FIELD][cat];
6090     last_coeff_ctx_base = h->cabac_state
6091         + last_coeff_flag_offset[MB_FIELD][cat];
6092     abs_level_m1_ctx_base = h->cabac_state
6093         + coeff_abs_level_m1_offset[cat];
6094
6095     if( cat == 5 ) {
6096 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
6097         for(last= 0; last < coefs; last++) { \
6098             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
6099             if( get_cabac( CC, sig_ctx )) { \
6100                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
6101                 index[coeff_count++] = last; \
6102                 if( get_cabac( CC, last_ctx ) ) { \
6103                     last= max_coeff; \
6104                     break; \
6105                 } \
6106             } \
6107         }\
6108         if( last == max_coeff -1 ) {\
6109             index[coeff_count++] = last;\
6110         }
6111         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
6112 #if defined(ARCH_X86) && defined(CONFIG_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
6113         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
6114     } else {
6115         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
6116 #else
6117         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
6118     } else {
6119         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
6120 #endif
6121     }
6122     assert(coeff_count > 0);
6123
6124     if( cat == 0 )
6125         h->cbp_table[mb_xy] |= 0x100;
6126     else if( cat == 1 || cat == 2 )
6127         h->non_zero_count_cache[scan8[n]] = coeff_count;
6128     else if( cat == 3 )
6129         h->cbp_table[mb_xy] |= 0x40 << n;
6130     else if( cat == 4 )
6131         h->non_zero_count_cache[scan8[16+n]] = coeff_count;
6132     else {
6133         assert( cat == 5 );
6134         fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
6135     }
6136
6137     for( coeff_count--; coeff_count >= 0; coeff_count-- ) {
6138         uint8_t *ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 )) + abs_level_m1_ctx_base;
6139         int j= scantable[index[coeff_count]];
6140
6141         if( get_cabac( CC, ctx ) == 0 ) {
6142             if( !qmul ) {
6143                 block[j] = get_cabac_bypass_sign( CC, -1);
6144             }else{
6145                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;;
6146             }
6147
6148             abslevel1++;
6149         } else {
6150             int coeff_abs = 2;
6151             ctx = 5 + FFMIN( 4, abslevelgt1 ) + abs_level_m1_ctx_base;
6152             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
6153                 coeff_abs++;
6154             }
6155
6156             if( coeff_abs >= 15 ) {
6157                 int j = 0;
6158                 while( get_cabac_bypass( CC ) ) {
6159                     j++;
6160                 }
6161
6162                 coeff_abs=1;
6163                 while( j-- ) {
6164                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
6165                 }
6166                 coeff_abs+= 14;
6167             }
6168
6169             if( !qmul ) {
6170                 if( get_cabac_bypass( CC ) ) block[j] = -coeff_abs;
6171                 else                                block[j] =  coeff_abs;
6172             }else{
6173                 if( get_cabac_bypass( CC ) ) block[j] = (-coeff_abs * qmul[j] + 32) >> 6;
6174                 else                                block[j] = ( coeff_abs * qmul[j] + 32) >> 6;
6175             }
6176
6177             abslevelgt1++;
6178         }
6179     }
6180 #ifdef CABAC_ON_STACK
6181             h->cabac.range     = cc.range     ;
6182             h->cabac.low       = cc.low       ;
6183             h->cabac.bytestream= cc.bytestream;
6184 #endif
6185     return 0;
6186 }
6187
6188 static inline void compute_mb_neighbors(H264Context *h)
6189 {
6190     MpegEncContext * const s = &h->s;
6191     const int mb_xy  = s->mb_x + s->mb_y*s->mb_stride;
6192     h->top_mb_xy     = mb_xy - s->mb_stride;
6193     h->left_mb_xy[0] = mb_xy - 1;
6194     if(FRAME_MBAFF){
6195         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
6196         const int top_pair_xy      = pair_xy     - s->mb_stride;
6197         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
6198         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
6199         const int curr_mb_frame_flag = !MB_FIELD;
6200         const int bottom = (s->mb_y & 1);
6201         if (bottom
6202                 ? !curr_mb_frame_flag // bottom macroblock
6203                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
6204                 ) {
6205             h->top_mb_xy -= s->mb_stride;
6206         }
6207         if (left_mb_frame_flag != curr_mb_frame_flag) {
6208             h->left_mb_xy[0] = pair_xy - 1;
6209         }
6210     }
6211     return;
6212 }
6213
/**
 * Decodes one macroblock from the CABAC coded bitstream.
 * @returns 0 if ok, a negative value (-1) if a decoding error is noticed
 */
6218 static int decode_mb_cabac(H264Context *h) {
6219     MpegEncContext * const s = &h->s;
6220     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
6221     int mb_type, partition_count, cbp = 0;
6222     int dct8x8_allowed= h->pps.transform_8x8_mode;
6223
6224     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
6225
6226     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
6227     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE ) {
6228         int skip;
6229         /* a skipped mb needs the aff flag from the following mb */
6230         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
6231             predict_field_decoding_flag(h);
6232         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
6233             skip = h->next_mb_skipped;
6234         else
6235             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
6236         /* read skip flags */
6237         if( skip ) {
6238             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
6239                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
6240                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
6241                 if(h->next_mb_skipped)
6242                     predict_field_decoding_flag(h);
6243                 else
6244                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
6245             }
6246
6247             decode_mb_skip(h);
6248
6249             h->cbp_table[mb_xy] = 0;
6250             h->chroma_pred_mode_table[mb_xy] = 0;
6251             h->last_qscale_diff = 0;
6252
6253             return 0;
6254
6255         }
6256     }
6257     if(FRAME_MBAFF){
6258         if( (s->mb_y&1) == 0 )
6259             h->mb_mbaff =
6260             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
6261     }else
6262         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
6263
6264     h->prev_mb_skipped = 0;
6265
6266     compute_mb_neighbors(h);
6267     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
6268         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
6269         return -1;
6270     }
6271
6272     if( h->slice_type == B_TYPE ) {
6273         if( mb_type < 23 ){
6274             partition_count= b_mb_type_info[mb_type].partition_count;
6275             mb_type=         b_mb_type_info[mb_type].type;
6276         }else{
6277             mb_type -= 23;
6278             goto decode_intra_mb;
6279         }
6280     } else if( h->slice_type == P_TYPE ) {
6281         if( mb_type < 5) {
6282             partition_count= p_mb_type_info[mb_type].partition_count;
6283             mb_type=         p_mb_type_info[mb_type].type;
6284         } else {
6285             mb_type -= 5;
6286             goto decode_intra_mb;
6287         }
6288     } else {
6289        assert(h->slice_type == I_TYPE);
6290 decode_intra_mb:
6291         partition_count = 0;
6292         cbp= i_mb_type_info[mb_type].cbp;
6293         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
6294         mb_type= i_mb_type_info[mb_type].type;
6295     }
6296     if(MB_FIELD)
6297         mb_type |= MB_TYPE_INTERLACED;
6298
6299     h->slice_table[ mb_xy ]= h->slice_num;
6300
6301     if(IS_INTRA_PCM(mb_type)) {
6302         const uint8_t *ptr;
6303         unsigned int x, y;
6304
6305         // We assume these blocks are very rare so we dont optimize it.
6306         // FIXME The two following lines get the bitstream position in the cabac
6307         // decode, I think it should be done by a function in cabac.h (or cabac.c).
6308         ptr= h->cabac.bytestream;
6309         if(h->cabac.low&0x1) ptr--;
6310         if(CABAC_BITS==16){
6311             if(h->cabac.low&0x1FF) ptr--;
6312         }
6313
6314         // The pixels are stored in the same order as levels in h->mb array.
6315         for(y=0; y<16; y++){
6316             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
6317             for(x=0; x<16; x++){
6318                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", *ptr);
6319                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
6320             }
6321         }
6322         for(y=0; y<8; y++){
6323             const int index= 256 + 4*(y&3) + 32*(y>>2);
6324             for(x=0; x<8; x++){
6325                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", *ptr);
6326                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
6327             }
6328         }
6329         for(y=0; y<8; y++){
6330             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
6331             for(x=0; x<8; x++){
6332                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", *ptr);
6333                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
6334             }
6335         }
6336
6337         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
6338
6339         // All blocks are present
6340         h->cbp_table[mb_xy] = 0x1ef;
6341         h->chroma_pred_mode_table[mb_xy] = 0;
6342         // In deblocking, the quantizer is 0
6343         s->current_picture.qscale_table[mb_xy]= 0;
6344         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0);
6345         // All coeffs are present
6346         memset(h->non_zero_count[mb_xy], 16, 16);
6347         s->current_picture.mb_type[mb_xy]= mb_type;
6348         return 0;
6349     }
6350
6351     if(MB_MBAFF){
6352         h->ref_count[0] <<= 1;
6353         h->ref_count[1] <<= 1;
6354     }
6355
6356     fill_caches(h, mb_type, 0);
6357
6358     if( IS_INTRA( mb_type ) ) {
6359         int i, pred_mode;
6360         if( IS_INTRA4x4( mb_type ) ) {
6361             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
6362                 mb_type |= MB_TYPE_8x8DCT;
6363                 for( i = 0; i < 16; i+=4 ) {
6364                     int pred = pred_intra_mode( h, i );
6365                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
6366                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
6367                 }
6368             } else {
6369                 for( i = 0; i < 16; i++ ) {
6370                     int pred = pred_intra_mode( h, i );
6371                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
6372
6373                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
6374                 }
6375             }
6376             write_back_intra_pred_mode(h);
6377             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
6378         } else {
6379             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
6380             if( h->intra16x16_pred_mode < 0 ) return -1;
6381         }
6382         h->chroma_pred_mode_table[mb_xy] =
6383         pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
6384
6385         pred_mode= check_intra_pred_mode( h, pred_mode );
6386         if( pred_mode < 0 ) return -1;
6387         h->chroma_pred_mode= pred_mode;
6388     } else if( partition_count == 4 ) {
6389         int i, j, sub_partition_count[4], list, ref[2][4];
6390
6391         if( h->slice_type == B_TYPE ) {
6392             for( i = 0; i < 4; i++ ) {
6393                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
6394                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
6395                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
6396             }
6397             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
6398                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
6399                 pred_direct_motion(h, &mb_type);
6400                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
6401                     for( i = 0; i < 4; i++ )
6402                         if( IS_DIRECT(h->sub_mb_type[i]) )
6403                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
6404                 }
6405             }
6406         } else {
6407             for( i = 0; i < 4; i++ ) {
6408                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
6409                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
6410                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
6411             }
6412         }
6413
6414         for( list = 0; list < h->list_count; list++ ) {
6415                 for( i = 0; i < 4; i++ ) {
6416                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
6417                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
6418                         if( h->ref_count[list] > 1 )
6419                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
6420                         else
6421                             ref[list][i] = 0;
6422                     } else {
6423                         ref[list][i] = -1;
6424                     }
6425                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
6426                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
6427                 }
6428         }
6429
6430         if(dct8x8_allowed)
6431             dct8x8_allowed = get_dct8x8_allowed(h);
6432
6433         for(list=0; list<h->list_count; list++){
6434             for(i=0; i<4; i++){
6435                 if(IS_DIRECT(h->sub_mb_type[i])){
6436                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
6437                     continue;
6438                 }
6439                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
6440
6441                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
6442                     const int sub_mb_type= h->sub_mb_type[i];
6443                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
6444                     for(j=0; j<sub_partition_count[i]; j++){
6445                         int mpx, mpy;
6446                         int mx, my;
6447                         const int index= 4*i + block_width*j;
6448                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
6449                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
6450                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
6451
6452                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
6453                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
6454                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
6455
6456                         if(IS_SUB_8X8(sub_mb_type)){
6457                             mv_cache[ 1 ][0]=
6458                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
6459                             mv_cache[ 1 ][1]=
6460                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
6461
6462                             mvd_cache[ 1 ][0]=
6463                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
6464                             mvd_cache[ 1 ][1]=
6465                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
6466                         }else if(IS_SUB_8X4(sub_mb_type)){
6467                             mv_cache[ 1 ][0]= mx;
6468                             mv_cache[ 1 ][1]= my;
6469
6470                             mvd_cache[ 1 ][0]= mx - mpx;
6471                             mvd_cache[ 1 ][1]= my - mpy;
6472                         }else if(IS_SUB_4X8(sub_mb_type)){
6473                             mv_cache[ 8 ][0]= mx;
6474                             mv_cache[ 8 ][1]= my;
6475
6476                             mvd_cache[ 8 ][0]= mx - mpx;
6477                             mvd_cache[ 8 ][1]= my - mpy;
6478                         }
6479                         mv_cache[ 0 ][0]= mx;
6480                         mv_cache[ 0 ][1]= my;
6481
6482                         mvd_cache[ 0 ][0]= mx - mpx;
6483                         mvd_cache[ 0 ][1]= my - mpy;
6484                     }
6485                 }else{
6486                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
6487                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
6488                     p[0] = p[1] = p[8] = p[9] = 0;
6489                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
6490                 }
6491             }
6492         }
6493     } else if( IS_DIRECT(mb_type) ) {
6494         pred_direct_motion(h, &mb_type);
6495         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
6496         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
6497         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
6498     } else {
6499         int list, mx, my, i, mpx, mpy;
6500         if(IS_16X16(mb_type)){
6501             for(list=0; list<h->list_count; list++){
6502                 if(IS_DIR(mb_type, 0, list)){
6503                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
6504                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
6505                 }else
6506                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
6507             }
6508             for(list=0; list<h->list_count; list++){
6509                 if(IS_DIR(mb_type, 0, list)){
6510                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
6511
6512                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
6513                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
6514                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
6515
6516                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
6517                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
6518                 }else
6519                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
6520             }
6521         }
6522         else if(IS_16X8(mb_type)){
6523             for(list=0; list<h->list_count; list++){
6524                     for(i=0; i<2; i++){
6525                         if(IS_DIR(mb_type, i, list)){
6526                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
6527                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
6528                         }else
6529                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
6530                     }
6531             }
6532             for(list=0; list<h->list_count; list++){
6533                 for(i=0; i<2; i++){
6534                     if(IS_DIR(mb_type, i, list)){
6535                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
6536                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
6537                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
6538                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
6539
6540                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
6541                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
6542                     }else{
6543                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
6544                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
6545                     }
6546                 }
6547             }
6548         }else{
6549             assert(IS_8X16(mb_type));
6550             for(list=0; list<h->list_count; list++){
6551                     for(i=0; i<2; i++){
6552                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
6553                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
6554                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
6555                         }else
6556                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
6557                     }
6558             }
6559             for(list=0; list<h->list_count; list++){
6560                 for(i=0; i<2; i++){
6561                     if(IS_DIR(mb_type, i, list)){
6562                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
6563                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
6564                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
6565
6566                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
6567                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
6568                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
6569                     }else{
6570                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
6571                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
6572                     }
6573                 }
6574             }
6575         }
6576     }
6577
6578    if( IS_INTER( mb_type ) ) {
6579         h->chroma_pred_mode_table[mb_xy] = 0;
6580         write_back_motion( h, mb_type );
6581    }
6582
6583     if( !IS_INTRA16x16( mb_type ) ) {
6584         cbp  = decode_cabac_mb_cbp_luma( h );
6585         cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
6586     }
6587
6588     h->cbp_table[mb_xy] = h->cbp = cbp;
6589
6590     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
6591         if( decode_cabac_mb_transform_size( h ) )
6592             mb_type |= MB_TYPE_8x8DCT;
6593     }
6594     s->current_picture.mb_type[mb_xy]= mb_type;
6595
6596     if( cbp || IS_INTRA16x16( mb_type ) ) {
6597         const uint8_t *scan, *scan8x8, *dc_scan;
6598         int dqp;
6599
6600         if(IS_INTERLACED(mb_type)){
6601             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
6602             scan= s->qscale ? h->field_scan : h->field_scan_q0;
6603             dc_scan= luma_dc_field_scan;
6604         }else{
6605             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
6606             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
6607             dc_scan= luma_dc_zigzag_scan;
6608         }
6609
6610         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
6611         if( dqp == INT_MIN ){
6612             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
6613             return -1;
6614         }
6615         s->qscale += dqp;
6616         if(((unsigned)s->qscale) > 51){
6617             if(s->qscale<0) s->qscale+= 52;
6618             else            s->qscale-= 52;
6619         }
6620         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
6621
6622         if( IS_INTRA16x16( mb_type ) ) {
6623             int i;
6624             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
6625             if( decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16) < 0)
6626                 return -1;
6627             if( cbp&15 ) {
6628                 for( i = 0; i < 16; i++ ) {
6629                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
6630                     if( decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 )
6631                         return -1;
6632                 }
6633             } else {
6634                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
6635             }
6636         } else {
6637             int i8x8, i4x4;
6638             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
6639                 if( cbp & (1<<i8x8) ) {
6640                     if( IS_8x8DCT(mb_type) ) {
6641                         if( decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
6642                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64) < 0 )
6643                             return -1;
6644                     } else
6645                     for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
6646                         const int index = 4*i8x8 + i4x4;
6647                         //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
6648 //START_TIMER
6649                         if( decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) < 0 )
6650                             return -1;
6651 //STOP_TIMER("decode_residual")
6652                     }
6653                 } else {
6654                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
6655                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
6656                 }
6657             }
6658         }
6659
6660         if( cbp&0x30 ){
6661             int c;
6662             for( c = 0; c < 2; c++ ) {
6663                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
6664                 if( decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4) < 0)
6665                     return -1;
6666             }
6667         }
6668
6669         if( cbp&0x20 ) {
6670             int c, i;
6671             for( c = 0; c < 2; c++ ) {
6672                 for( i = 0; i < 4; i++ ) {
6673                     const int index = 16 + 4 * c + i;
6674                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
6675                     if( decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp], 15) < 0)
6676                         return -1;
6677                 }
6678             }
6679         } else {
6680             uint8_t * const nnz= &h->non_zero_count_cache[0];
6681             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6682             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6683         }
6684     } else {
6685         uint8_t * const nnz= &h->non_zero_count_cache[0];
6686         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
6687         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6688         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6689         h->last_qscale_diff = 0;
6690     }
6691
6692     s->current_picture.qscale_table[mb_xy]= s->qscale;
6693     write_back_non_zero_count(h);
6694
6695     if(MB_MBAFF){
6696         h->ref_count[0] >>= 1;
6697         h->ref_count[1] >>= 1;
6698     }
6699
6700     return 0;
6701 }
6702
6703
6704 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6705     int i, d;
6706     const int index_a = qp + h->slice_alpha_c0_offset;
6707     const int alpha = (alpha_table+52)[index_a];
6708     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6709
6710     if( bS[0] < 4 ) {
6711         int8_t tc[4];
6712         for(i=0; i<4; i++)
6713             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6714         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6715     } else {
6716         /* 16px edge length, because bS=4 is triggered by being at
6717          * the edge of an intra MB, so all 4 bS are the same */
6718             for( d = 0; d < 16; d++ ) {
6719                 const int p0 = pix[-1];
6720                 const int p1 = pix[-2];
6721                 const int p2 = pix[-3];
6722
6723                 const int q0 = pix[0];
6724                 const int q1 = pix[1];
6725                 const int q2 = pix[2];
6726
6727                 if( FFABS( p0 - q0 ) < alpha &&
6728                     FFABS( p1 - p0 ) < beta &&
6729                     FFABS( q1 - q0 ) < beta ) {
6730
6731                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6732                         if( FFABS( p2 - p0 ) < beta)
6733                         {
6734                             const int p3 = pix[-4];
6735                             /* p0', p1', p2' */
6736                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6737                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6738                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6739                         } else {
6740                             /* p0' */
6741                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6742                         }
6743                         if( FFABS( q2 - q0 ) < beta)
6744                         {
6745                             const int q3 = pix[3];
6746                             /* q0', q1', q2' */
6747                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6748                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6749                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6750                         } else {
6751                             /* q0' */
6752                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6753                         }
6754                     }else{
6755                         /* p0', q0' */
6756                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6757                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6758                     }
6759                     tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
6760                 }
6761                 pix += stride;
6762             }
6763     }
6764 }
6765 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6766     int i;
6767     const int index_a = qp + h->slice_alpha_c0_offset;
6768     const int alpha = (alpha_table+52)[index_a];
6769     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6770
6771     if( bS[0] < 4 ) {
6772         int8_t tc[4];
6773         for(i=0; i<4; i++)
6774             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6775         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6776     } else {
6777         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6778     }
6779 }
6780
6781 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6782     int i;
6783     for( i = 0; i < 16; i++, pix += stride) {
6784         int index_a;
6785         int alpha;
6786         int beta;
6787
6788         int qp_index;
6789         int bS_index = (i >> 1);
6790         if (!MB_FIELD) {
6791             bS_index &= ~1;
6792             bS_index |= (i & 1);
6793         }
6794
6795         if( bS[bS_index] == 0 ) {
6796             continue;
6797         }
6798
6799         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6800         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6801         alpha = (alpha_table+52)[index_a];
6802         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6803
6804         if( bS[bS_index] < 4 ) {
6805             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
6806             const int p0 = pix[-1];
6807             const int p1 = pix[-2];
6808             const int p2 = pix[-3];
6809             const int q0 = pix[0];
6810             const int q1 = pix[1];
6811             const int q2 = pix[2];
6812
6813             if( FFABS( p0 - q0 ) < alpha &&
6814                 FFABS( p1 - p0 ) < beta &&
6815                 FFABS( q1 - q0 ) < beta ) {
6816                 int tc = tc0;
6817                 int i_delta;
6818
6819                 if( FFABS( p2 - p0 ) < beta ) {
6820                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6821                     tc++;
6822                 }
6823                 if( FFABS( q2 - q0 ) < beta ) {
6824                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6825                     tc++;
6826                 }
6827
6828                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6829                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6830                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6831                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6832             }
6833         }else{
6834             const int p0 = pix[-1];
6835             const int p1 = pix[-2];
6836             const int p2 = pix[-3];
6837
6838             const int q0 = pix[0];
6839             const int q1 = pix[1];
6840             const int q2 = pix[2];
6841
6842             if( FFABS( p0 - q0 ) < alpha &&
6843                 FFABS( p1 - p0 ) < beta &&
6844                 FFABS( q1 - q0 ) < beta ) {
6845
6846                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6847                     if( FFABS( p2 - p0 ) < beta)
6848                     {
6849                         const int p3 = pix[-4];
6850                         /* p0', p1', p2' */
6851                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6852                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6853                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6854                     } else {
6855                         /* p0' */
6856                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6857                     }
6858                     if( FFABS( q2 - q0 ) < beta)
6859                     {
6860                         const int q3 = pix[3];
6861                         /* q0', q1', q2' */
6862                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6863                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6864                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6865                     } else {
6866                         /* q0' */
6867                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6868                     }
6869                 }else{
6870                     /* p0', q0' */
6871                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6872                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6873                 }
6874                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6875             }
6876         }
6877     }
6878 }
6879 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6880     int i;
6881     for( i = 0; i < 8; i++, pix += stride) {
6882         int index_a;
6883         int alpha;
6884         int beta;
6885
6886         int qp_index;
6887         int bS_index = i;
6888
6889         if( bS[bS_index] == 0 ) {
6890             continue;
6891         }
6892
6893         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6894         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6895         alpha = (alpha_table+52)[index_a];
6896         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6897
6898         if( bS[bS_index] < 4 ) {
6899             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6900             const int p0 = pix[-1];
6901             const int p1 = pix[-2];
6902             const int q0 = pix[0];
6903             const int q1 = pix[1];
6904
6905             if( FFABS( p0 - q0 ) < alpha &&
6906                 FFABS( p1 - p0 ) < beta &&
6907                 FFABS( q1 - q0 ) < beta ) {
6908                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6909
6910                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6911                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6912                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6913             }
6914         }else{
6915             const int p0 = pix[-1];
6916             const int p1 = pix[-2];
6917             const int q0 = pix[0];
6918             const int q1 = pix[1];
6919
6920             if( FFABS( p0 - q0 ) < alpha &&
6921                 FFABS( p1 - p0 ) < beta &&
6922                 FFABS( q1 - q0 ) < beta ) {
6923
6924                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6925                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6926                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6927             }
6928         }
6929     }
6930 }
6931
6932 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6933     int i, d;
6934     const int index_a = qp + h->slice_alpha_c0_offset;
6935     const int alpha = (alpha_table+52)[index_a];
6936     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6937     const int pix_next  = stride;
6938
6939     if( bS[0] < 4 ) {
6940         int8_t tc[4];
6941         for(i=0; i<4; i++)
6942             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6943         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6944     } else {
6945         /* 16px edge length, see filter_mb_edgev */
6946             for( d = 0; d < 16; d++ ) {
6947                 const int p0 = pix[-1*pix_next];
6948                 const int p1 = pix[-2*pix_next];
6949                 const int p2 = pix[-3*pix_next];
6950                 const int q0 = pix[0];
6951                 const int q1 = pix[1*pix_next];
6952                 const int q2 = pix[2*pix_next];
6953
6954                 if( FFABS( p0 - q0 ) < alpha &&
6955                     FFABS( p1 - p0 ) < beta &&
6956                     FFABS( q1 - q0 ) < beta ) {
6957
6958                     const int p3 = pix[-4*pix_next];
6959                     const int q3 = pix[ 3*pix_next];
6960
6961                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6962                         if( FFABS( p2 - p0 ) < beta) {
6963                             /* p0', p1', p2' */
6964                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6965                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6966                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6967                         } else {
6968                             /* p0' */
6969                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6970                         }
6971                         if( FFABS( q2 - q0 ) < beta) {
6972                             /* q0', q1', q2' */
6973                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6974                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6975                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6976                         } else {
6977                             /* q0' */
6978                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6979                         }
6980                     }else{
6981                         /* p0', q0' */
6982                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6983                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6984                     }
6985                     tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6986                 }
6987                 pix++;
6988             }
6989     }
6990 }
6991
6992 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6993     int i;
6994     const int index_a = qp + h->slice_alpha_c0_offset;
6995     const int alpha = (alpha_table+52)[index_a];
6996     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6997
6998     if( bS[0] < 4 ) {
6999         int8_t tc[4];
7000         for(i=0; i<4; i++)
7001             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
7002         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
7003     } else {
7004         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
7005     }
7006 }
7007
7008 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
7009     MpegEncContext * const s = &h->s;
7010     int mb_xy, mb_type;
7011     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
7012
7013     if(mb_x==0 || mb_y==0 || !s->dsp.h264_loop_filter_strength) {
7014         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
7015         return;
7016     }
7017     assert(!FRAME_MBAFF);
7018
7019     mb_xy = mb_x + mb_y*s->mb_stride;
7020     mb_type = s->current_picture.mb_type[mb_xy];
7021     qp = s->current_picture.qscale_table[mb_xy];
7022     qp0 = s->current_picture.qscale_table[mb_xy-1];
7023     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
7024     qpc = get_chroma_qp( h->pps.chroma_qp_index_offset, qp );
7025     qpc0 = get_chroma_qp( h->pps.chroma_qp_index_offset, qp0 );
7026     qpc1 = get_chroma_qp( h->pps.chroma_qp_index_offset, qp1 );
7027     qp0 = (qp + qp0 + 1) >> 1;
7028     qp1 = (qp + qp1 + 1) >> 1;
7029     qpc0 = (qpc + qpc0 + 1) >> 1;
7030     qpc1 = (qpc + qpc1 + 1) >> 1;
7031     qp_thresh = 15 - h->slice_alpha_c0_offset;
7032     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
7033        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
7034         return;
7035
7036     if( IS_INTRA(mb_type) ) {
7037         int16_t bS4[4] = {4,4,4,4};
7038         int16_t bS3[4] = {3,3,3,3};
7039         if( IS_8x8DCT(mb_type) ) {
7040             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
7041             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
7042             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bS4, qp1 );
7043             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
7044         } else {
7045             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
7046             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
7047             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
7048             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
7049             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bS4, qp1 );
7050             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
7051             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
7052             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
7053         }
7054         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
7055         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
7056         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
7057         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
7058         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bS4, qpc1 );
7059         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
7060         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bS4, qpc1 );
7061         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
7062         return;
7063     } else {
7064         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
7065         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
7066         int edges;
7067         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
7068             edges = 4;
7069             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
7070         } else {
7071             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
7072                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
7073             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
7074                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
7075                              ? 3 : 0;
7076             int step = IS_8x8DCT(mb_type) ? 2 : 1;
7077             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
7078             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
7079                                               (h->slice_type == B_TYPE), edges, step, mask_edge0, mask_edge1 );
7080         }
7081         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
7082             bSv[0][0] = 0x0004000400040004ULL;
7083         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
7084             bSv[1][0] = 0x0004000400040004ULL;
7085
7086 #define FILTER(hv,dir,edge)\
7087         if(bSv[dir][edge]) {\
7088             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
7089             if(!(edge&1)) {\
7090                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
7091                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
7092             }\
7093         }
7094         if( edges == 1 ) {
7095             FILTER(v,0,0);
7096             FILTER(h,1,0);
7097         } else if( IS_8x8DCT(mb_type) ) {
7098             FILTER(v,0,0);
7099             FILTER(v,0,2);
7100             FILTER(h,1,0);
7101             FILTER(h,1,2);
7102         } else {
7103             FILTER(v,0,0);
7104             FILTER(v,0,1);
7105             FILTER(v,0,2);
7106             FILTER(v,0,3);
7107             FILTER(h,1,0);
7108             FILTER(h,1,1);
7109             FILTER(h,1,2);
7110             FILTER(h,1,3);
7111         }
7112 #undef FILTER
7113     }
7114 }
7115
7116 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
7117     MpegEncContext * const s = &h->s;
7118     const int mb_xy= mb_x + mb_y*s->mb_stride;
7119     const int mb_type = s->current_picture.mb_type[mb_xy];
7120     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
7121     int first_vertical_edge_done = 0;
7122     int dir;
7123     /* FIXME: A given frame may occupy more than one position in
7124      * the reference list. So ref2frm should be populated with
7125      * frame numbers, not indices. */
7126     static const int ref2frm[34] = {-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
7127                                     16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
7128
7129     //for sufficiently low qp, filtering wouldn't do anything
7130     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
7131     if(!FRAME_MBAFF){
7132         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX(0, h->pps.chroma_qp_index_offset);
7133         int qp = s->current_picture.qscale_table[mb_xy];
7134         if(qp <= qp_thresh
7135            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
7136            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
7137             return;
7138         }
7139     }
7140
7141     if (FRAME_MBAFF
7142             // left mb is in picture
7143             && h->slice_table[mb_xy-1] != 255
7144             // and current and left pair do not have the same interlaced type
7145             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
7146             // and left mb is in the same slice if deblocking_filter == 2
7147             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
7148         /* First vertical edge is different in MBAFF frames
7149          * There are 8 different bS to compute and 2 different Qp
7150          */
7151         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
7152         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
7153         int16_t bS[8];
7154         int qp[2];
7155         int chroma_qp[2];
7156         int mb_qp, mbn0_qp, mbn1_qp;
7157         int i;
7158         first_vertical_edge_done = 1;
7159
7160         if( IS_INTRA(mb_type) )
7161             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
7162         else {
7163             for( i = 0; i < 8; i++ ) {
7164                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
7165
7166                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
7167                     bS[i] = 4;
7168                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
7169                          /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
7170                          h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
7171                     bS[i] = 2;
7172                 else
7173                     bS[i] = 1;
7174             }
7175         }
7176
7177         mb_qp = s->current_picture.qscale_table[mb_xy];
7178         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
7179         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
7180         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
7181         chroma_qp[0] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, mb_qp ) +
7182                          get_chroma_qp( h->pps.chroma_qp_index_offset, mbn0_qp ) + 1 ) >> 1;
7183         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
7184         chroma_qp[1] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, mb_qp ) +
7185                          get_chroma_qp( h->pps.chroma_qp_index_offset, mbn1_qp ) + 1 ) >> 1;
7186
7187         /* Filter edge */
7188         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPc:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], chroma_qp[0], chroma_qp[1], linesize, uvlinesize);
7189         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
7190         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
7191         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, chroma_qp );
7192         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, chroma_qp );
7193     }
7194     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
7195     for( dir = 0; dir < 2; dir++ )
7196     {
7197         int edge;
7198         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
7199         const int mbm_type = s->current_picture.mb_type[mbm_xy];
7200         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
7201
7202         const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
7203                                   == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
7204         // how often to recheck mv-based bS when iterating between edges
7205         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
7206                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
7207         // how often to recheck mv-based bS when iterating along each edge
7208         const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
7209
7210         if (first_vertical_edge_done) {
7211             start = 1;
7212             first_vertical_edge_done = 0;
7213         }
7214
7215         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
7216             start = 1;
7217
7218         if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
7219             && !IS_INTERLACED(mb_type)
7220             && IS_INTERLACED(mbm_type)
7221             ) {
7222             // This is a special case in the norm where the filtering must
7223             // be done twice (one each of the field) even if we are in a
7224             // frame macroblock.
7225             //
7226             static const int nnz_idx[4] = {4,5,6,3};
7227             unsigned int tmp_linesize   = 2 *   linesize;
7228             unsigned int tmp_uvlinesize = 2 * uvlinesize;
7229             int mbn_xy = mb_xy - 2 * s->mb_stride;
7230             int qp, chroma_qp;
7231             int i, j;
7232             int16_t bS[4];
7233
7234             for(j=0; j<2; j++, mbn_xy += s->mb_stride){
7235                 if( IS_INTRA(mb_type) ||
7236                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
7237                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
7238                 } else {
7239                     const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
7240                     for( i = 0; i < 4; i++ ) {
7241                         if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
7242                             mbn_nnz[nnz_idx[i]] != 0 )
7243                             bS[i] = 2;
7244                         else
7245                             bS[i] = 1;
7246                     }
7247                 }
7248                 // Do not use s->qscale as luma quantizer because it has not the same
7249                 // value in IPCM macroblocks.
7250                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
7251                 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
7252                 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
7253                 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
7254                 chroma_qp = ( h->chroma_qp +
7255                               get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
7256                 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp );
7257                 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp );
7258             }
7259
7260             start = 1;
7261         }
7262
7263         /* Calculate bS */
7264         for( edge = start; edge < edges; edge++ ) {
7265             /* mbn_xy: neighbor macroblock */
7266             const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
7267             const int mbn_type = s->current_picture.mb_type[mbn_xy];
7268             int16_t bS[4];
7269             int qp;
7270
7271             if( (edge&1) && IS_8x8DCT(mb_type) )
7272                 continue;
7273
7274             if( IS_INTRA(mb_type) ||
7275                 IS_INTRA(mbn_type) ) {
7276                 int value;
7277                 if (edge == 0) {
7278                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
7279                         || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
7280                     ) {
7281                         value = 4;
7282                     } else {
7283                         value = 3;
7284                     }
7285                 } else {
7286                     value = 3;
7287                 }
7288                 bS[0] = bS[1] = bS[2] = bS[3] = value;
7289             } else {
7290                 int i, l;
7291                 int mv_done;
7292
7293                 if( edge & mask_edge ) {
7294                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
7295                     mv_done = 1;
7296                 }
7297                 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
7298                     bS[0] = bS[1] = bS[2] = bS[3] = 1;
7299                     mv_done = 1;
7300                 }
7301                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
7302                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
7303                     int bn_idx= b_idx - (dir ? 8:1);
7304                     int v = 0;
7305                     for( l = 0; !v && l < 1 + (h->slice_type == B_TYPE); l++ ) {
7306                         v |= ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
7307                              FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
7308                              FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
7309                     }
7310                     bS[0] = bS[1] = bS[2] = bS[3] = v;
7311                     mv_done = 1;
7312                 }
7313                 else
7314                     mv_done = 0;
7315
7316                 for( i = 0; i < 4; i++ ) {
7317                     int x = dir == 0 ? edge : i;
7318                     int y = dir == 0 ? i    : edge;
7319                     int b_idx= 8 + 4 + x + 8*y;
7320                     int bn_idx= b_idx - (dir ? 8:1);
7321
7322                     if( h->non_zero_count_cache[b_idx] != 0 ||
7323                         h->non_zero_count_cache[bn_idx] != 0 ) {
7324                         bS[i] = 2;
7325                     }
7326                     else if(!mv_done)
7327                     {
7328                         bS[i] = 0;
7329                         for( l = 0; l < 1 + (h->slice_type == B_TYPE); l++ ) {
7330                             if( ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
7331                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
7332                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
7333                                 bS[i] = 1;
7334                                 break;
7335                             }
7336                         }
7337                     }
7338                 }
7339
7340                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
7341                     continue;
7342             }
7343
7344             /* Filter edge */
7345             // Do not use s->qscale as luma quantizer because it has not the same
7346             // value in IPCM macroblocks.
7347             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
7348             //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
7349             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
7350             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
7351             if( dir == 0 ) {
7352                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
7353                 if( (edge&1) == 0 ) {
7354                     int chroma_qp = ( h->chroma_qp +
7355                                       get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
7356                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS, chroma_qp );
7357                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS, chroma_qp );
7358                 }
7359             } else {
7360                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
7361                 if( (edge&1) == 0 ) {
7362                     int chroma_qp = ( h->chroma_qp +
7363                                       get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
7364                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
7365                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
7366                 }
7367             }
7368         }
7369     }
7370 }
7371
7372 static int decode_slice(H264Context *h){
7373     MpegEncContext * const s = &h->s;
7374     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
7375
7376     s->mb_skip_run= -1;
7377
7378     if( h->pps.cabac ) {
7379         int i;
7380
7381         /* realign */
7382         align_get_bits( &s->gb );
7383
7384         /* init cabac */
7385         ff_init_cabac_states( &h->cabac);
7386         ff_init_cabac_decoder( &h->cabac,
7387                                s->gb.buffer + get_bits_count(&s->gb)/8,
7388                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
7389         /* calculate pre-state */
7390         for( i= 0; i < 460; i++ ) {
7391             int pre;
7392             if( h->slice_type == I_TYPE )
7393                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
7394             else
7395                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
7396
7397             if( pre <= 63 )
7398                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
7399             else
7400                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
7401         }
7402
7403         for(;;){
7404 //START_TIMER
7405             int ret = decode_mb_cabac(h);
7406             int eos;
7407 //STOP_TIMER("decode_mb_cabac")
7408
7409             if(ret>=0) hl_decode_mb(h);
7410
7411             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
7412                 s->mb_y++;
7413
7414                 if(ret>=0) ret = decode_mb_cabac(h);
7415
7416                 if(ret>=0) hl_decode_mb(h);
7417                 s->mb_y--;
7418             }
7419             eos = get_cabac_terminate( &h->cabac );
7420
7421             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
7422                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%d)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
7423                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7424                 return -1;
7425             }
7426
7427             if( ++s->mb_x >= s->mb_width ) {
7428                 s->mb_x = 0;
7429                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
7430                 ++s->mb_y;
7431                 if(FRAME_MBAFF) {
7432                     ++s->mb_y;
7433                 }
7434             }
7435
7436             if( eos || s->mb_y >= s->mb_height ) {
7437                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7438                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7439                 return 0;
7440             }
7441         }
7442
7443     } else {
7444         for(;;){
7445             int ret = decode_mb_cavlc(h);
7446
7447             if(ret>=0) hl_decode_mb(h);
7448
7449             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
7450                 s->mb_y++;
7451                 ret = decode_mb_cavlc(h);
7452
7453                 if(ret>=0) hl_decode_mb(h);
7454                 s->mb_y--;
7455             }
7456
7457             if(ret<0){
7458                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
7459                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7460
7461                 return -1;
7462             }
7463
7464             if(++s->mb_x >= s->mb_width){
7465                 s->mb_x=0;
7466                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
7467                 ++s->mb_y;
7468                 if(FRAME_MBAFF) {
7469                     ++s->mb_y;
7470                 }
7471                 if(s->mb_y >= s->mb_height){
7472                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7473
7474                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
7475                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7476
7477                         return 0;
7478                     }else{
7479                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7480
7481                         return -1;
7482                     }
7483                 }
7484             }
7485
7486             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
7487                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7488                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
7489                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7490
7491                     return 0;
7492                 }else{
7493                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7494
7495                     return -1;
7496                 }
7497             }
7498         }
7499     }
7500
7501 #if 0
7502     for(;s->mb_y < s->mb_height; s->mb_y++){
7503         for(;s->mb_x < s->mb_width; s->mb_x++){
7504             int ret= decode_mb(h);
7505
7506             hl_decode_mb(h);
7507
7508             if(ret<0){
7509                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
7510                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7511
7512                 return -1;
7513             }
7514
7515             if(++s->mb_x >= s->mb_width){
7516                 s->mb_x=0;
7517                 if(++s->mb_y >= s->mb_height){
7518                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
7519                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7520
7521                         return 0;
7522                     }else{
7523                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7524
7525                         return -1;
7526                     }
7527                 }
7528             }
7529
7530             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
7531                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
7532                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7533
7534                     return 0;
7535                 }else{
7536                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7537
7538                     return -1;
7539                 }
7540             }
7541         }
7542         s->mb_x=0;
7543         ff_draw_horiz_band(s, 16*s->mb_y, 16);
7544     }
7545 #endif
7546     return -1; //not reached
7547 }
7548
7549 static int decode_unregistered_user_data(H264Context *h, int size){
7550     MpegEncContext * const s = &h->s;
7551     uint8_t user_data[16+256];
7552     int e, build, i;
7553
7554     if(size<16)
7555         return -1;
7556
7557     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
7558         user_data[i]= get_bits(&s->gb, 8);
7559     }
7560
7561     user_data[i]= 0;
7562     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
7563     if(e==1 && build>=0)
7564         h->x264_build= build;
7565
7566     if(s->avctx->debug & FF_DEBUG_BUGS)
7567         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
7568
7569     for(; i<size; i++)
7570         skip_bits(&s->gb, 8);
7571
7572     return 0;
7573 }
7574
7575 static int decode_sei(H264Context *h){
7576     MpegEncContext * const s = &h->s;
7577
7578     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
7579         int size, type;
7580
7581         type=0;
7582         do{
7583             type+= show_bits(&s->gb, 8);
7584         }while(get_bits(&s->gb, 8) == 255);
7585
7586         size=0;
7587         do{
7588             size+= show_bits(&s->gb, 8);
7589         }while(get_bits(&s->gb, 8) == 255);
7590
7591         switch(type){
7592         case 5:
7593             if(decode_unregistered_user_data(h, size) < 0)
7594                 return -1;
7595             break;
7596         default:
7597             skip_bits(&s->gb, 8*size);
7598         }
7599
7600         //FIXME check bits here
7601         align_get_bits(&s->gb);
7602     }
7603
7604     return 0;
7605 }
7606
7607 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
7608     MpegEncContext * const s = &h->s;
7609     int cpb_count, i;
7610     cpb_count = get_ue_golomb(&s->gb) + 1;
7611     get_bits(&s->gb, 4); /* bit_rate_scale */
7612     get_bits(&s->gb, 4); /* cpb_size_scale */
7613     for(i=0; i<cpb_count; i++){
7614         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
7615         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
7616         get_bits1(&s->gb);     /* cbr_flag */
7617     }
7618     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
7619     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
7620     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
7621     get_bits(&s->gb, 5); /* time_offset_length */
7622 }
7623
7624 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
7625     MpegEncContext * const s = &h->s;
7626     int aspect_ratio_info_present_flag;
7627     unsigned int aspect_ratio_idc;
7628     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
7629
7630     aspect_ratio_info_present_flag= get_bits1(&s->gb);
7631
7632     if( aspect_ratio_info_present_flag ) {
7633         aspect_ratio_idc= get_bits(&s->gb, 8);
7634         if( aspect_ratio_idc == EXTENDED_SAR ) {
7635             sps->sar.num= get_bits(&s->gb, 16);
7636             sps->sar.den= get_bits(&s->gb, 16);
7637         }else if(aspect_ratio_idc < 14){
7638             sps->sar=  pixel_aspect[aspect_ratio_idc];
7639         }else{
7640             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
7641             return -1;
7642         }
7643     }else{
7644         sps->sar.num=
7645         sps->sar.den= 0;
7646     }
7647 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
7648
7649     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
7650         get_bits1(&s->gb);      /* overscan_appropriate_flag */
7651     }
7652
7653     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
7654         get_bits(&s->gb, 3);    /* video_format */
7655         get_bits1(&s->gb);      /* video_full_range_flag */
7656         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
7657             get_bits(&s->gb, 8); /* colour_primaries */
7658             get_bits(&s->gb, 8); /* transfer_characteristics */
7659             get_bits(&s->gb, 8); /* matrix_coefficients */
7660         }
7661     }
7662
7663     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
7664         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
7665         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
7666     }
7667
7668     sps->timing_info_present_flag = get_bits1(&s->gb);
7669     if(sps->timing_info_present_flag){
7670         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7671         sps->time_scale = get_bits_long(&s->gb, 32);
7672         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7673     }
7674
7675     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7676     if(nal_hrd_parameters_present_flag)
7677         decode_hrd_parameters(h, sps);
7678     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7679     if(vcl_hrd_parameters_present_flag)
7680         decode_hrd_parameters(h, sps);
7681     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
7682         get_bits1(&s->gb);     /* low_delay_hrd_flag */
7683     get_bits1(&s->gb);         /* pic_struct_present_flag */
7684
7685     sps->bitstream_restriction_flag = get_bits1(&s->gb);
7686     if(sps->bitstream_restriction_flag){
7687         unsigned int num_reorder_frames;
7688         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
7689         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7690         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7691         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7692         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7693         num_reorder_frames= get_ue_golomb(&s->gb);
7694         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7695
7696         if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7697             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", num_reorder_frames);
7698             return -1;
7699         }
7700
7701         sps->num_reorder_frames= num_reorder_frames;
7702     }
7703
7704     return 0;
7705 }
7706
7707 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7708                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7709     MpegEncContext * const s = &h->s;
7710     int i, last = 8, next = 8;
7711     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
7712     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7713         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7714     else
7715     for(i=0;i<size;i++){
7716         if(next)
7717             next = (last + get_se_golomb(&s->gb)) & 0xff;
7718         if(!i && !next){ /* matrix not written, we use the preset one */
7719             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7720             break;
7721         }
7722         last = factors[scan[i]] = next ? next : last;
7723     }
7724 }
7725
7726 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7727                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7728     MpegEncContext * const s = &h->s;
7729     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7730     const uint8_t *fallback[4] = {
7731         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7732         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7733         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7734         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7735     };
7736     if(get_bits1(&s->gb)){
7737         sps->scaling_matrix_present |= is_sps;
7738         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7739         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7740         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7741         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7742         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7743         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7744         if(is_sps || pps->transform_8x8_mode){
7745             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7746             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7747         }
7748     } else if(fallback_sps) {
7749         memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
7750         memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
7751     }
7752 }
7753
7754 static inline int decode_seq_parameter_set(H264Context *h){
7755     MpegEncContext * const s = &h->s;
7756     int profile_idc, level_idc;
7757     unsigned int sps_id, tmp, mb_width, mb_height;
7758     int i;
7759     SPS *sps;
7760
7761     profile_idc= get_bits(&s->gb, 8);
7762     get_bits1(&s->gb);   //constraint_set0_flag
7763     get_bits1(&s->gb);   //constraint_set1_flag
7764     get_bits1(&s->gb);   //constraint_set2_flag
7765     get_bits1(&s->gb);   //constraint_set3_flag
7766     get_bits(&s->gb, 4); // reserved
7767     level_idc= get_bits(&s->gb, 8);
7768     sps_id= get_ue_golomb(&s->gb);
7769
7770     if (sps_id >= MAX_SPS_COUNT){
7771         // ok it has gone out of hand, someone is sending us bad stuff.
7772         av_log(h->s.avctx, AV_LOG_ERROR, "illegal sps_id (%d)\n", sps_id);
7773         return -1;
7774     }
7775
7776     sps= &h->sps_buffer[ sps_id ];
7777     sps->profile_idc= profile_idc;
7778     sps->level_idc= level_idc;
7779
7780     if(sps->profile_idc >= 100){ //high profile
7781         if(get_ue_golomb(&s->gb) == 3) //chroma_format_idc
7782             get_bits1(&s->gb);  //residual_color_transform_flag
7783         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7784         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7785         sps->transform_bypass = get_bits1(&s->gb);
7786         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7787     }else
7788         sps->scaling_matrix_present = 0;
7789
7790     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7791     sps->poc_type= get_ue_golomb(&s->gb);
7792
7793     if(sps->poc_type == 0){ //FIXME #define
7794         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7795     } else if(sps->poc_type == 1){//FIXME #define
7796         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7797         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7798         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7799         tmp= get_ue_golomb(&s->gb);
7800
7801         if(tmp >= sizeof(sps->offset_for_ref_frame) / sizeof(sps->offset_for_ref_frame[0])){
7802             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
7803             return -1;
7804         }
7805         sps->poc_cycle_length= tmp;
7806
7807         for(i=0; i<sps->poc_cycle_length; i++)
7808             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7809     }else if(sps->poc_type != 2){
7810         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7811         return -1;
7812     }
7813
7814     tmp= get_ue_golomb(&s->gb);
7815     if(tmp > MAX_PICTURE_COUNT-2){
7816         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7817     }
7818     sps->ref_frame_count= tmp;
7819     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7820     mb_width= get_ue_golomb(&s->gb) + 1;
7821     mb_height= get_ue_golomb(&s->gb) + 1;
7822     if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
7823        avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
7824         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7825         return -1;
7826     }
7827     sps->mb_width = mb_width;
7828     sps->mb_height= mb_height;
7829
7830     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7831     if(!sps->frame_mbs_only_flag)
7832         sps->mb_aff= get_bits1(&s->gb);
7833     else
7834         sps->mb_aff= 0;
7835
7836     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7837
7838 #ifndef ALLOW_INTERLACE
7839     if(sps->mb_aff)
7840         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7841 #endif
7842     if(!sps->direct_8x8_inference_flag && sps->mb_aff)
7843         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
7844
7845     sps->crop= get_bits1(&s->gb);
7846     if(sps->crop){
7847         sps->crop_left  = get_ue_golomb(&s->gb);
7848         sps->crop_right = get_ue_golomb(&s->gb);
7849         sps->crop_top   = get_ue_golomb(&s->gb);
7850         sps->crop_bottom= get_ue_golomb(&s->gb);
7851         if(sps->crop_left || sps->crop_top){
7852             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7853         }
7854     }else{
7855         sps->crop_left  =
7856         sps->crop_right =
7857         sps->crop_top   =
7858         sps->crop_bottom= 0;
7859     }
7860
7861     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7862     if( sps->vui_parameters_present_flag )
7863         decode_vui_parameters(h, sps);
7864
7865     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7866         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
7867                sps_id, sps->profile_idc, sps->level_idc,
7868                sps->poc_type,
7869                sps->ref_frame_count,
7870                sps->mb_width, sps->mb_height,
7871                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7872                sps->direct_8x8_inference_flag ? "8B8" : "",
7873                sps->crop_left, sps->crop_right,
7874                sps->crop_top, sps->crop_bottom,
7875                sps->vui_parameters_present_flag ? "VUI" : ""
7876                );
7877     }
7878     return 0;
7879 }
7880
7881 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7882     MpegEncContext * const s = &h->s;
7883     unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
7884     PPS *pps;
7885
7886     if(pps_id>=MAX_PPS_COUNT){
7887         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
7888         return -1;
7889     }
7890     pps = &h->pps_buffer[pps_id];
7891
7892     tmp= get_ue_golomb(&s->gb);
7893     if(tmp>=MAX_SPS_COUNT){
7894         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7895         return -1;
7896     }
7897     pps->sps_id= tmp;
7898
7899     pps->cabac= get_bits1(&s->gb);
7900     pps->pic_order_present= get_bits1(&s->gb);
7901     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7902     if(pps->slice_group_count > 1 ){
7903         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7904         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7905         switch(pps->mb_slice_group_map_type){
7906         case 0:
7907 #if 0
7908 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7909 |    run_length[ i ]                                |1  |ue(v)   |
7910 #endif
7911             break;
7912         case 2:
7913 #if 0
7914 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7915 |{                                                  |   |        |
7916 |    top_left_mb[ i ]                               |1  |ue(v)   |
7917 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7918 |   }                                               |   |        |
7919 #endif
7920             break;
7921         case 3:
7922         case 4:
7923         case 5:
7924 #if 0
7925 |   slice_group_change_direction_flag               |1  |u(1)    |
7926 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7927 #endif
7928             break;
7929         case 6:
7930 #if 0
7931 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7932 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7933 |)                                                  |   |        |
7934 |    slice_group_id[ i ]                            |1  |u(v)    |
7935 #endif
7936             break;
7937         }
7938     }
7939     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7940     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7941     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7942         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7943         pps->ref_count[0]= pps->ref_count[1]= 1;
7944         return -1;
7945     }
7946
7947     pps->weighted_pred= get_bits1(&s->gb);
7948     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7949     pps->init_qp= get_se_golomb(&s->gb) + 26;
7950     pps->init_qs= get_se_golomb(&s->gb) + 26;
7951     pps->chroma_qp_index_offset= get_se_golomb(&s->gb);
7952     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7953     pps->constrained_intra_pred= get_bits1(&s->gb);
7954     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7955
7956     pps->transform_8x8_mode= 0;
7957     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7958     memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
7959     memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
7960
7961     if(get_bits_count(&s->gb) < bit_length){
7962         pps->transform_8x8_mode= get_bits1(&s->gb);
7963         decode_scaling_matrices(h, &h->sps_buffer[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7964         get_se_golomb(&s->gb);  //second_chroma_qp_index_offset
7965     }
7966
7967     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7968         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d %s %s %s %s\n",
7969                pps_id, pps->sps_id,
7970                pps->cabac ? "CABAC" : "CAVLC",
7971                pps->slice_group_count,
7972                pps->ref_count[0], pps->ref_count[1],
7973                pps->weighted_pred ? "weighted" : "",
7974                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset,
7975                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7976                pps->constrained_intra_pred ? "CONSTR" : "",
7977                pps->redundant_pic_cnt_present ? "REDU" : "",
7978                pps->transform_8x8_mode ? "8x8DCT" : ""
7979                );
7980     }
7981
7982     return 0;
7983 }
7984
7985 /**
7986  * finds the end of the current frame in the bitstream.
7987  * @return the position of the first byte of the next frame, or -1
7988  */
7989 static int find_frame_end(H264Context *h, const uint8_t *buf, int buf_size){
7990     int i;
7991     uint32_t state;
7992     ParseContext *pc = &(h->s.parse_context);
7993 //printf("first %02X%02X%02X%02X\n", buf[0], buf[1],buf[2],buf[3]);
7994 //    mb_addr= pc->mb_addr - 1;
7995     state= pc->state;
7996     if(state>13)
7997         state= 7;
7998
7999     for(i=0; i<buf_size; i++){
8000         if(state==7){
8001             for(; i<buf_size; i++){
8002                 if(!buf[i]){
8003                     state=2;
8004                     break;
8005                 }
8006             }
8007         }else if(state<=2){
8008             if(buf[i]==1)   state^= 5; //2->7, 1->4, 0->5
8009             else if(buf[i]) state = 7;
8010             else            state>>=1; //2->1, 1->0, 0->0
8011         }else if(state<=5){
8012             int v= buf[i] & 0x1F;
8013             if(v==7 || v==8 || v==9){
8014                 if(pc->frame_start_found){
8015                     i++;
8016 found:
8017                     pc->state=7;
8018                     pc->frame_start_found= 0;
8019                     return i-(state&5);
8020                 }
8021             }else if(v==1 || v==2 || v==5){
8022                 if(pc->frame_start_found){
8023                     state+=8;
8024                     continue;
8025                 }else
8026                     pc->frame_start_found = 1;
8027             }
8028             state= 7;
8029         }else{
8030             if(buf[i] & 0x80)
8031                 goto found;
8032             state= 7;
8033         }
8034     }
8035     pc->state= state;
8036     return END_NOT_FOUND;
8037 }
8038
8039 #ifdef CONFIG_H264_PARSER
8040 static int h264_parse(AVCodecParserContext *s,
8041                       AVCodecContext *avctx,
8042                       uint8_t **poutbuf, int *poutbuf_size,
8043                       const uint8_t *buf, int buf_size)
8044 {
8045     H264Context *h = s->priv_data;
8046     ParseContext *pc = &h->s.parse_context;
8047     int next;
8048
8049     if(s->flags & PARSER_FLAG_COMPLETE_FRAMES){
8050         next= buf_size;
8051     }else{
8052         next= find_frame_end(h, buf, buf_size);
8053
8054         if (ff_combine_frame(pc, next, (uint8_t **)&buf, &buf_size) < 0) {
8055             *poutbuf = NULL;
8056             *poutbuf_size = 0;
8057             return buf_size;
8058         }
8059
8060         if(next<0 && next != END_NOT_FOUND){
8061             assert(pc->last_index + next >= 0 );
8062             find_frame_end(h, &pc->buffer[pc->last_index + next], -next); //update state
8063         }
8064     }
8065
8066     *poutbuf = (uint8_t *)buf;
8067     *poutbuf_size = buf_size;
8068     return next;
8069 }
8070
8071 static int h264_split(AVCodecContext *avctx,
8072                       const uint8_t *buf, int buf_size)
8073 {
8074     int i;
8075     uint32_t state = -1;
8076     int has_sps= 0;
8077
8078     for(i=0; i<=buf_size; i++){
8079         if((state&0xFFFFFF1F) == 0x107)
8080             has_sps=1;
8081 /*        if((state&0xFFFFFF1F) == 0x101 || (state&0xFFFFFF1F) == 0x102 || (state&0xFFFFFF1F) == 0x105){
8082         }*/
8083         if((state&0xFFFFFF00) == 0x100 && (state&0xFFFFFF1F) != 0x107 && (state&0xFFFFFF1F) != 0x108 && (state&0xFFFFFF1F) != 0x109){
8084             if(has_sps){
8085                 while(i>4 && buf[i-5]==0) i--;
8086                 return i-4;
8087             }
8088         }
8089         if (i<buf_size)
8090             state= (state<<8) | buf[i];
8091     }
8092     return 0;
8093 }
8094 #endif /* CONFIG_H264_PARSER */
8095
8096 static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
8097     MpegEncContext * const s = &h->s;
8098     AVCodecContext * const avctx= s->avctx;
8099     int buf_index=0;
8100 #if 0
8101     int i;
8102     for(i=0; i<50; i++){
8103         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
8104     }
8105 #endif
8106     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
8107         h->slice_num = 0;
8108         s->current_picture_ptr= NULL;
8109     }
8110
8111     for(;;){
8112         int consumed;
8113         int dst_length;
8114         int bit_length;
8115         uint8_t *ptr;
8116         int i, nalsize = 0;
8117
8118       if(h->is_avc) {
8119         if(buf_index >= buf_size) break;
8120         nalsize = 0;
8121         for(i = 0; i < h->nal_length_size; i++)
8122             nalsize = (nalsize << 8) | buf[buf_index++];
8123         if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
8124             if(nalsize == 1){
8125                 buf_index++;
8126                 continue;
8127             }else{
8128                 av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
8129                 break;
8130             }
8131         }
8132       } else {
8133         // start code prefix search
8134         for(; buf_index + 3 < buf_size; buf_index++){
8135             // This should always succeed in the first iteration.
8136             if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
8137                 break;
8138         }
8139
8140         if(buf_index+3 >= buf_size) break;
8141
8142         buf_index+=3;
8143       }
8144
8145         ptr= decode_nal(h, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
8146         if (ptr==NULL || dst_length < 0){
8147             return -1;
8148         }
8149         while(ptr[dst_length - 1] == 0 && dst_length > 0)
8150             dst_length--;
8151         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
8152
8153         if(s->avctx->debug&FF_DEBUG_STARTCODE){
8154             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", h->nal_unit_type, buf_index, buf_size, dst_length);
8155         }
8156
8157         if (h->is_avc && (nalsize != consumed))
8158             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
8159
8160         buf_index += consumed;
8161
8162         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME dont discard SEI id
8163            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
8164             continue;
8165
8166         switch(h->nal_unit_type){
8167         case NAL_IDR_SLICE:
8168             idr(h); //FIXME ensure we don't loose some frames if there is reordering
8169         case NAL_SLICE:
8170             init_get_bits(&s->gb, ptr, bit_length);
8171             h->intra_gb_ptr=
8172             h->inter_gb_ptr= &s->gb;
8173             s->data_partitioning = 0;
8174
8175             if(decode_slice_header(h) < 0){
8176                 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
8177                 break;
8178             }
8179             s->current_picture_ptr->key_frame= (h->nal_unit_type == NAL_IDR_SLICE);
8180             if(h->redundant_pic_count==0 && s->hurry_up < 5
8181                && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc)
8182                && (avctx->skip_frame < AVDISCARD_BIDIR  || h->slice_type!=B_TYPE)
8183                && (avctx->skip_frame < AVDISCARD_NONKEY || h->slice_type==I_TYPE)
8184                && avctx->skip_frame < AVDISCARD_ALL)
8185                 decode_slice(h);
8186             break;
8187         case NAL_DPA:
8188             init_get_bits(&s->gb, ptr, bit_length);
8189             h->intra_gb_ptr=
8190             h->inter_gb_ptr= NULL;
8191             s->data_partitioning = 1;
8192
8193             if(decode_slice_header(h) < 0){
8194                 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
8195             }
8196             break;
8197         case NAL_DPB:
8198             init_get_bits(&h->intra_gb, ptr, bit_length);
8199             h->intra_gb_ptr= &h->intra_gb;
8200             break;
8201         case NAL_DPC:
8202             init_get_bits(&h->inter_gb, ptr, bit_length);
8203             h->inter_gb_ptr= &h->inter_gb;
8204
8205             if(h->redundant_pic_count==0 && h->intra_gb_ptr && s->data_partitioning
8206                && s->context_initialized
8207                && s->hurry_up < 5
8208                && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc)
8209                && (avctx->skip_frame < AVDISCARD_BIDIR  || h->slice_type!=B_TYPE)
8210                && (avctx->skip_frame < AVDISCARD_NONKEY || h->slice_type==I_TYPE)
8211                && avctx->skip_frame < AVDISCARD_ALL)
8212                 decode_slice(h);
8213             break;
8214         case NAL_SEI:
8215             init_get_bits(&s->gb, ptr, bit_length);
8216             decode_sei(h);
8217             break;
8218         case NAL_SPS:
8219             init_get_bits(&s->gb, ptr, bit_length);
8220             decode_seq_parameter_set(h);
8221
8222             if(s->flags& CODEC_FLAG_LOW_DELAY)
8223                 s->low_delay=1;
8224
8225             if(avctx->has_b_frames < 2)
8226                 avctx->has_b_frames= !s->low_delay;
8227             break;
8228         case NAL_PPS:
8229             init_get_bits(&s->gb, ptr, bit_length);
8230
8231             decode_picture_parameter_set(h, bit_length);
8232
8233             break;
8234         case NAL_AUD:
8235         case NAL_END_SEQUENCE:
8236         case NAL_END_STREAM:
8237         case NAL_FILLER_DATA:
8238         case NAL_SPS_EXT:
8239         case NAL_AUXILIARY_SLICE:
8240             break;
8241         default:
8242             av_log(avctx, AV_LOG_ERROR, "Unknown NAL code: %d\n", h->nal_unit_type);
8243         }
8244     }
8245
8246     return buf_index;
8247 }
8248
8249 /**
8250  * returns the number of bytes consumed for building the current frame
8251  */
8252 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
8253     if(s->flags&CODEC_FLAG_TRUNCATED){
8254         pos -= s->parse_context.last_index;
8255         if(pos<0) pos=0; // FIXME remove (unneeded?)
8256
8257         return pos;
8258     }else{
8259         if(pos==0) pos=1; //avoid infinite loops (i doubt thats needed but ...)
8260         if(pos+10>buf_size) pos=buf_size; // oops ;)
8261
8262         return pos;
8263     }
8264 }
8265
8266 static int decode_frame(AVCodecContext *avctx,
8267                              void *data, int *data_size,
8268                              uint8_t *buf, int buf_size)
8269 {
8270     H264Context *h = avctx->priv_data;
8271     MpegEncContext *s = &h->s;
8272     AVFrame *pict = data;
8273     int buf_index;
8274
8275     s->flags= avctx->flags;
8276     s->flags2= avctx->flags2;
8277
8278    /* no supplementary picture */
8279     if (buf_size == 0) {
8280         Picture *out;
8281         int i, out_idx;
8282
8283 //FIXME factorize this with the output code below
8284         out = h->delayed_pic[0];
8285         out_idx = 0;
8286         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
8287             if(h->delayed_pic[i]->poc < out->poc){
8288                 out = h->delayed_pic[i];
8289                 out_idx = i;
8290             }
8291
8292         for(i=out_idx; h->delayed_pic[i]; i++)
8293             h->delayed_pic[i] = h->delayed_pic[i+1];
8294
8295         if(out){
8296             *data_size = sizeof(AVFrame);
8297             *pict= *(AVFrame*)out;
8298         }
8299
8300         return 0;
8301     }
8302
8303     if(s->flags&CODEC_FLAG_TRUNCATED){
8304         int next= find_frame_end(h, buf, buf_size);
8305
8306         if( ff_combine_frame(&s->parse_context, next, &buf, &buf_size) < 0 )
8307             return buf_size;
8308 //printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
8309     }
8310
8311     if(h->is_avc && !h->got_avcC) {
8312         int i, cnt, nalsize;
8313         unsigned char *p = avctx->extradata;
8314         if(avctx->extradata_size < 7) {
8315             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
8316             return -1;
8317         }
8318         if(*p != 1) {
8319             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
8320             return -1;
8321         }
8322         /* sps and pps in the avcC always have length coded with 2 bytes,
8323            so put a fake nal_length_size = 2 while parsing them */
8324         h->nal_length_size = 2;
8325         // Decode sps from avcC
8326         cnt = *(p+5) & 0x1f; // Number of sps
8327         p += 6;
8328         for (i = 0; i < cnt; i++) {
8329             nalsize = AV_RB16(p) + 2;
8330             if(decode_nal_units(h, p, nalsize) < 0) {
8331                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
8332                 return -1;
8333             }
8334             p += nalsize;
8335         }
8336         // Decode pps from avcC
8337         cnt = *(p++); // Number of pps
8338         for (i = 0; i < cnt; i++) {
8339             nalsize = AV_RB16(p) + 2;
8340             if(decode_nal_units(h, p, nalsize)  != nalsize) {
8341                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
8342                 return -1;
8343             }
8344             p += nalsize;
8345         }
8346         // Now store right nal length size, that will be use to parse all other nals
8347         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
8348         // Do not reparse avcC
8349         h->got_avcC = 1;
8350     }
8351
8352     if(avctx->frame_number==0 && !h->is_avc && s->avctx->extradata_size){
8353         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
8354             return -1;
8355     }
8356
8357     buf_index=decode_nal_units(h, buf, buf_size);
8358     if(buf_index < 0)
8359         return -1;
8360
8361     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
8362         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
8363         return -1;
8364     }
8365
8366     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
8367         Picture *out = s->current_picture_ptr;
8368         Picture *cur = s->current_picture_ptr;
8369         Picture *prev = h->delayed_output_pic;
8370         int i, pics, cross_idr, out_of_order, out_idx;
8371
8372         s->mb_y= 0;
8373
8374         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
8375         s->current_picture_ptr->pict_type= s->pict_type;
8376
8377         h->prev_frame_num_offset= h->frame_num_offset;
8378         h->prev_frame_num= h->frame_num;
8379         if(s->current_picture_ptr->reference){
8380             h->prev_poc_msb= h->poc_msb;
8381             h->prev_poc_lsb= h->poc_lsb;
8382         }
8383         if(s->current_picture_ptr->reference)
8384             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
8385
8386         ff_er_frame_end(s);
8387
8388         MPV_frame_end(s);
8389
8390     //FIXME do something with unavailable reference frames
8391
8392 #if 0 //decode order
8393         *data_size = sizeof(AVFrame);
8394 #else
8395         /* Sort B-frames into display order */
8396
8397         if(h->sps.bitstream_restriction_flag
8398            && s->avctx->has_b_frames < h->sps.num_reorder_frames){
8399             s->avctx->has_b_frames = h->sps.num_reorder_frames;
8400             s->low_delay = 0;
8401         }
8402
8403         pics = 0;
8404         while(h->delayed_pic[pics]) pics++;
8405
8406         assert(pics+1 < sizeof(h->delayed_pic) / sizeof(h->delayed_pic[0]));
8407
8408         h->delayed_pic[pics++] = cur;
8409         if(cur->reference == 0)
8410             cur->reference = 1;
8411
8412         cross_idr = 0;
8413         for(i=0; h->delayed_pic[i]; i++)
8414             if(h->delayed_pic[i]->key_frame || h->delayed_pic[i]->poc==0)
8415                 cross_idr = 1;
8416
8417         out = h->delayed_pic[0];
8418         out_idx = 0;
8419         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
8420             if(h->delayed_pic[i]->poc < out->poc){
8421                 out = h->delayed_pic[i];
8422                 out_idx = i;
8423             }
8424
8425         out_of_order = !cross_idr && prev && out->poc < prev->poc;
8426         if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
8427             { }
8428         else if(prev && pics <= s->avctx->has_b_frames)
8429             out = prev;
8430         else if((out_of_order && pics-1 == s->avctx->has_b_frames && pics < 15)
8431            || (s->low_delay &&
8432             ((!cross_idr && prev && out->poc > prev->poc + 2)
8433              || cur->pict_type == B_TYPE)))
8434         {
8435             s->low_delay = 0;
8436             s->avctx->has_b_frames++;
8437             out = prev;
8438         }
8439         else if(out_of_order)
8440             out = prev;
8441
8442         if(out_of_order || pics > s->avctx->has_b_frames){
8443             for(i=out_idx; h->delayed_pic[i]; i++)
8444                 h->delayed_pic[i] = h->delayed_pic[i+1];
8445         }
8446
8447         if(prev == out)
8448             *data_size = 0;
8449         else
8450             *data_size = sizeof(AVFrame);
8451         if(prev && prev != out && prev->reference == 1)
8452             prev->reference = 0;
8453         h->delayed_output_pic = out;
8454 #endif
8455
8456         if(out)
8457             *pict= *(AVFrame*)out;
8458         else
8459             av_log(avctx, AV_LOG_DEBUG, "no picture\n");
8460     }
8461
8462     assert(pict->data[0] || !*data_size);
8463     ff_print_debug_info(s, pict);
8464 //printf("out %d\n", (int)pict->data[0]);
8465 #if 0 //?
8466
8467     /* Return the Picture timestamp as the frame number */
8468     /* we substract 1 because it is added on utils.c    */
8469     avctx->frame_number = s->picture_number - 1;
8470 #endif
8471     return get_consumed_bytes(s, buf_index, buf_size);
8472 }
#if 0
/**
 * Fills h->mb_avail[] for the current macroblock: a neighbour counts as
 * available when it lies inside the picture and its slice_table entry equals
 * the current slice number.
 * Indices 0..2 are the row above (left, centre, right), index 3 is the left
 * neighbour; indices 4 and 5 are constants.
 */
static inline void fill_mb_avail(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy  = s->mb_x + s->mb_y*s->mb_stride;
    const int top_xy = mb_xy - s->mb_stride;

    /* the row above is only examined when there is one (mb_y != 0) */
    h->mb_avail[0]= s->mb_y && s->mb_x                 && h->slice_table[top_xy - 1] == h->slice_num;
    h->mb_avail[1]= s->mb_y                            && h->slice_table[top_xy    ] == h->slice_num;
    h->mb_avail[2]= s->mb_y && s->mb_x+1 < s->mb_width && h->slice_table[top_xy + 1] == h->slice_num;
    h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
    h->mb_avail[4]= 1; //FIXME move out
    h->mb_avail[5]= 0; //FIXME move out
}
#endif
8492
#if 0 //selftest
#define COUNT 8000
#define SIZE (COUNT*40)
/**
 * Disabled self test: round-trips unsigned and signed exp-golomb codes,
 * measures the error of the 4x4 DCT/IDCT pair and round-trips random
 * payloads through encode_nal()/decode_nal().
 *
 * @return 0 on success, -1 if a NAL layer check fails
 */
int main(){
    int i;
    uint8_t temp[SIZE];
    PutBitContext pb;
    GetBitContext gb;
//    int int_temp[10000];
    DSPContext dsp;
    AVCodecContext avctx;

    dsputil_init(&dsp, &avctx);

    init_put_bits(&pb, temp, SIZE);
    printf("testing unsigned exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_ue_golomb(&pb, i);
        STOP_TIMER("set_ue_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_ue_golomb(&gb);
        if(j != i){
            printf("missmatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
//            return -1;
        }
        STOP_TIMER("get_ue_golomb");
    }


    init_put_bits(&pb, temp, SIZE);
    printf("testing signed exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_se_golomb(&pb, i - COUNT/2);
        STOP_TIMER("set_se_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_se_golomb(&gb);
        if(j != i - COUNT/2){
            /* report the value that was actually written, not the loop index */
            printf("missmatch! at %d (%d should be %d) bits:%6X\n", i, j, i - COUNT/2, s);
//            return -1;
        }
        STOP_TIMER("get_se_golomb");
    }

    printf("testing 4x4 (I)DCT\n");

    DCTELEM block[16];
    uint8_t src[16], ref[16];
    uint64_t error= 0, max_error=0;

    for(i=0; i<COUNT; i++){
        int j;
//        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
        for(j=0; j<16; j++){
            ref[j]= random()%255;
            src[j]= random()%255;
        }

        h264_diff_dct_c(block, src, ref, 4);

        //normalize
        for(j=0; j<16; j++){
//            printf("%d ", block[j]);
            block[j]= block[j]*4;
            if(j&1) block[j]= (block[j]*4 + 2)/5;
            if(j&4) block[j]= (block[j]*4 + 2)/5;
        }
//        printf("\n");

        /* use the DSPContext initialised above; there is no MpegEncContext here */
        dsp.h264_idct_add(ref, block, 4);
/*        for(j=0; j<16; j++){
            printf("%d ", ref[j]);
        }
        printf("\n");*/

        for(j=0; j<16; j++){
            int diff= FFABS(src[j] - ref[j]);

            error+= diff*diff;
            max_error= FFMAX(max_error, diff);
        }
    }
    printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
#if 0
    printf("testing quantizer\n");
    for(qp=0; qp<52; qp++){
        for(i=0; i<16; i++)
            src1_block[i]= src2_block[i]= random()%255;

    }
#endif
    printf("Testing NAL layer\n");

    uint8_t bitstream[COUNT];
    uint8_t nal[COUNT*2];
    H264Context h;
    memset(&h, 0, sizeof(H264Context));

    for(i=0; i<COUNT; i++){
        int zeros= i;
        int nal_length;
        int consumed;
        int out_length;
        uint8_t *out;
        int j;

        /* start from a payload without zero bytes ... */
        for(j=0; j<COUNT; j++){
            bitstream[j]= (random() % 255) + 1;
        }

        /* ... then clear exactly 'zeros' distinct positions */
        for(j=0; j<zeros; j++){
            int pos= random() % COUNT;
            while(bitstream[pos] == 0){
                pos++;
                pos %= COUNT;
            }
            bitstream[pos]=0;
        }

        START_TIMER

        nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
        if(nal_length<0){
            printf("encoding failed\n");
            return -1;
        }

        out= decode_nal(&h, nal, &out_length, &consumed, nal_length);

        STOP_TIMER("NAL")

        if(out_length != COUNT){
            printf("incorrect length %d %d\n", out_length, COUNT);
            return -1;
        }

        if(consumed != nal_length){
            printf("incorrect consumed length %d %d\n", nal_length, consumed);
            return -1;
        }

        if(memcmp(bitstream, out, COUNT)){
            printf("missmatch\n");
            return -1;
        }
    }

    printf("Testing RBSP\n");


    return 0;
}
#endif
8665
8666
8667 static int decode_end(AVCodecContext *avctx)
8668 {
8669     H264Context *h = avctx->priv_data;
8670     MpegEncContext *s = &h->s;
8671
8672     av_freep(&h->rbsp_buffer);
8673     free_tables(h); //FIXME cleanup init stuff perhaps
8674     MPV_common_end(s);
8675
8676 //    memset(h, 0, sizeof(H264Context));
8677
8678     return 0;
8679 }
8680
8681
8682 AVCodec h264_decoder = {
8683     "h264",
8684     CODEC_TYPE_VIDEO,
8685     CODEC_ID_H264,
8686     sizeof(H264Context),
8687     decode_init,
8688     NULL,
8689     decode_end,
8690     decode_frame,
8691     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,
8692     .flush= flush_dpb,
8693 };
8694
8695 #ifdef CONFIG_H264_PARSER
8696 AVCodecParser h264_parser = {
8697     { CODEC_ID_H264 },
8698     sizeof(H264Context),
8699     NULL,
8700     h264_parse,
8701     ff_parse_close,
8702     h264_split,
8703 };
8704 #endif
8705
8706 #include "svq3.c"