git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  *
  21  */
  22
  23 /**
  24  * @file h264.c
  25  * H.264 / AVC / MPEG4 part10 codec.
  26  * @author Michael Niedermayer <michaelni@gmx.at>
  27  */
  28
  29 #include "common.h"
  30 #include "dsputil.h"
  31 #include "avcodec.h"
  32 #include "mpegvideo.h"
  33 #include "h264data.h"
  34 #include "golomb.h"
  35
  36 #include "cabac.h"
  37
  38 //#undef NDEBUG
  39 #include <assert.h>
  40
  41 #define interlaced_dct interlaced_dct_is_a_bad_name
  42 #define mb_intra mb_intra_isnt_initalized_see_mb_type
     /* The two defines above replace every later use of these identifiers in
      * this file with the long names on the right.
      * NOTE(review): presumably this makes accidental use of the same-named
      * MpegEncContext fields fail to compile -- confirm against mpegvideo.h. */
  43
  44 #define LUMA_DC_BLOCK_INDEX   25
  45 #define CHROMA_DC_BLOCK_INDEX 26
  46
     /* NOTE(review): the *_VLC_BITS values are presumably the lookup widths used
      * when the static VLC tables of this file are built and read; the code that
      * uses them is not in the visible part of the file. */
  47 #define CHROMA_DC_COEFF_TOKEN_VLC_BITS 8
  48 #define COEFF_TOKEN_VLC_BITS           8
  49 #define TOTAL_ZEROS_VLC_BITS           9
  50 #define CHROMA_DC_TOTAL_ZEROS_VLC_BITS 3
  51 #define RUN_VLC_BITS                   3
  52 #define RUN7_VLC_BITS                  6
  53
  54 #define MAX_SPS_COUNT 32   ///< number of entries in H264Context.sps_buffer
  55 #define MAX_PPS_COUNT 256  ///< number of entries in H264Context.pps_buffer
  56
  57 #define MAX_MMCO_COUNT 66  ///< number of entries in H264Context.mmco
  58
  59 /* Compiling in interlaced support reduces the speed
  60  * of progressive decoding by about 2%.
      *
      * With ALLOW_INTERLACE defined, MB_MBAFF, MB_FIELD and FRAME_MBAFF expand to
      * members of a variable named h, so they can only be used where such a
      * variable (an H264Context pointer) is in scope.
      * Without it, all three are the constant 0 and IS_INTERLACED() is redefined
      * to always evaluate to 0. */
  61 #define ALLOW_INTERLACE
  62
  63 #ifdef ALLOW_INTERLACE
  64 #define MB_MBAFF h->mb_mbaff
  65 #define MB_FIELD h->mb_field_decoding_flag
  66 #define FRAME_MBAFF h->mb_aff_frame
  67 #else
  68 #define MB_MBAFF 0
  69 #define MB_FIELD 0
  70 #define FRAME_MBAFF 0
  71 #undef  IS_INTERLACED
  72 #define IS_INTERLACED(mb_type) 0
  73 #endif
  74
  75 /**
  76  * Sequence parameter set.
      * Where a member has a trailing comment, it names the bitstream syntax
      * element the member corresponds to (with any offset that is applied).
  77  */
  78 typedef struct SPS{
  79
  80     int profile_idc;
  81     int level_idc;
  82     int transform_bypass;              ///< qpprime_y_zero_transform_bypass_flag
  83     int log2_max_frame_num;            ///< log2_max_frame_num_minus4 + 4
  84     int poc_type;                      ///< pic_order_cnt_type
  85     int log2_max_poc_lsb;              ///< log2_max_pic_order_cnt_lsb_minus4 (NOTE(review): possibly "+ 4" as for log2_max_frame_num -- confirm against the SPS parser)
  86     int delta_pic_order_always_zero_flag;
  87     int offset_for_non_ref_pic;
  88     int offset_for_top_to_bottom_field;
  89     int poc_cycle_length;              ///< num_ref_frames_in_pic_order_cnt_cycle
  90     int ref_frame_count;               ///< num_ref_frames
  91     int gaps_in_frame_num_allowed_flag;
  92     int mb_width;                      ///< frame_width_in_mbs_minus1 + 1
  93     int mb_height;                     ///< frame_height_in_mbs_minus1 + 1
  94     int frame_mbs_only_flag;
  95     int mb_aff;                        ///< mb_adaptive_frame_field_flag
  96     int direct_8x8_inference_flag;
  97     int crop;                   ///< frame_cropping_flag
  98     int crop_left;              ///< frame_cropping_rect_left_offset
  99     int crop_right;             ///< frame_cropping_rect_right_offset
 100     int crop_top;               ///< frame_cropping_rect_top_offset
 101     int crop_bottom;            ///< frame_cropping_rect_bottom_offset
 102     int vui_parameters_present_flag;
 103     AVRational sar;
 104     int timing_info_present_flag;
 105     uint32_t num_units_in_tick;
 106     uint32_t time_scale;
 107     int fixed_frame_rate_flag;
 108     short offset_for_ref_frame[256]; //FIXME allocate dynamically? (fixed 256 entries, stored as short)
 109     int bitstream_restriction_flag;
 110     int num_reorder_frames;
 111     int scaling_matrix_present;
 112     uint8_t scaling_matrix4[6][16];    ///< 6 lists of 16 entries; same shape as PPS.scaling_matrix4
 113     uint8_t scaling_matrix8[2][64];    ///< 2 lists of 64 entries; same shape as PPS.scaling_matrix8
 114 }SPS;
 115
 116 /**
 117  * Picture parameter set.
      * Where a member has a trailing comment, it names the bitstream syntax
      * element the member corresponds to (with any offset that is applied).
 118  */
 119 typedef struct PPS{
 120     int sps_id;
 121     int cabac;                  ///< entropy_coding_mode_flag
 122     int pic_order_present;      ///< pic_order_present_flag
 123     int slice_group_count;      ///< num_slice_groups_minus1 + 1
 124     int mb_slice_group_map_type;
 125     int ref_count[2];           ///< num_ref_idx_l0/1_active_minus1 + 1
 126     int weighted_pred;          ///< weighted_pred_flag
 127     int weighted_bipred_idc;
 128     int init_qp;                ///< pic_init_qp_minus26 + 26
 129     int init_qs;                ///< pic_init_qs_minus26 + 26
 130     int chroma_qp_index_offset;
 131     int deblocking_filter_parameters_present; ///< deblocking_filter_parameters_present_flag
 132     int constrained_intra_pred; ///< constrained_intra_pred_flag
 133     int redundant_pic_cnt_present; ///< redundant_pic_cnt_present_flag
 134     int transform_8x8_mode;     ///< transform_8x8_mode_flag
 135     uint8_t scaling_matrix4[6][16]; ///< 6 lists of 16 entries; same shape as SPS.scaling_matrix4
 136     uint8_t scaling_matrix8[2][64]; ///< 2 lists of 64 entries; same shape as SPS.scaling_matrix8
 137 }PPS;
 138
 139 /**
 140  * Memory management control operation opcode.
      * MMCO_END is explicitly 0; the enumerators after it take the consecutive
      * values 1 (MMCO_SHORT2UNUSED) to 6 (MMCO_LONG).
 141  */
 142 typedef enum MMCOOpcode{
 143     MMCO_END=0,
 144     MMCO_SHORT2UNUSED,
 145     MMCO_LONG2UNUSED,
 146     MMCO_SHORT2LONG,
 147     MMCO_SET_MAX_LONG,
 148     MMCO_RESET,
 149     MMCO_LONG,
 150 } MMCOOpcode;
 151
 152 /**
 153  * Memory management control operation.
      * One entry of the H264Context.mmco buffer: an opcode plus two operands.
      * NOTE(review): which of the operands is meaningful presumably depends on
      * the opcode -- confirm against the code that fills and executes the buffer.
 154  */
 155 typedef struct MMCO{
 156     MMCOOpcode opcode;
 157     int short_frame_num;
 158     int long_index;
 159 } MMCO;
 160
 161 /**
 162  * H264Context: H.264 decoder state.
      * Embeds the generic MpegEncContext as its first member (s) and adds the
      * stored and current parameter sets, the per-macroblock neighbour caches,
      * POC / weighted prediction / deblocking parameters, the reference picture
      * lists and the CABAC state.
 163  */
 164 typedef struct H264Context{
 165     MpegEncContext s;
 166     int nal_ref_idc;
 167     int nal_unit_type;
 168     uint8_t *rbsp_buffer;
 169     unsigned int rbsp_buffer_size;
 170
 171     /**
 172       * Used to parse AVC variant of h264
 173       */
 174     int is_avc; ///< this flag is != 0 if codec is avc1
 175     int got_avcC; ///< flag used to parse avcC data only once
 176     int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4)
 177
 178     int chroma_qp; //QPc
 179
 180     int prev_mb_skipped;
 181     int next_mb_skipped;
 182
 183     //prediction stuff
 184     int chroma_pred_mode;
 185     int intra16x16_pred_mode;
 186
 187     int top_mb_xy;      ///< set by fill_caches() to the index of the top neighbour macroblock
 188     int left_mb_xy[2];  ///< set by fill_caches() to the indices of the left neighbour macroblock(s)
 189
 190     int8_t intra4x4_pred_mode_cache[5*8];
 191     int8_t (*intra4x4_pred_mode)[8];
 192     void (*pred4x4  [9+3])(uint8_t *src, uint8_t *topright, int stride);//FIXME move to dsp?
 193     void (*pred8x8l [9+3])(uint8_t *src, int topleft, int topright, int stride);
 194     void (*pred8x8  [4+3])(uint8_t *src, int stride);
 195     void (*pred16x16[4+3])(uint8_t *src, int stride);
 196     unsigned int topleft_samples_available;
 197     unsigned int top_samples_available;
 198     unsigned int topright_samples_available;
 199     unsigned int left_samples_available;
 200     uint8_t (*top_borders[2])[16+2*8];
 201     uint8_t left_border[2*(17+2*9)];
 202
 203     /**
 204      * Non-zero coefficient count cache.
 205      * Entries of unavailable neighbours are set to 64 by fill_caches(),
      * or to 0 when CABAC is used and the current macroblock is not intra.
 206      */
 207     DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]);
 208     uint8_t (*non_zero_count)[16];
 209
 210     /**
 211      * Motion vector cache.
 212      */
 213     DECLARE_ALIGNED_8(int16_t, mv_cache[2][5*8][2]);
 214     DECLARE_ALIGNED_8(int8_t, ref_cache[2][5*8]);
 215 #define LIST_NOT_USED -1 //FIXME rename?
 216 #define PART_NOT_AVAILABLE -2
 217
 218     /**
 219      * Is 1 if the MVs and references of the given list are set to 0,0,-2
      * (-2 being PART_NOT_AVAILABLE).
 220      */
 221     int mv_cache_clean[2];
 222
 223     /**
 224      * number of neighbors (top and/or left) that used 8x8 dct
 225      */
 226     int neighbor_transform_size;
 227
 228     /**
 229      * block_offset[ 0..23] for frame macroblocks
 230      * block_offset[24..47] for field macroblocks
 231      */
 232     int block_offset[2*(16+8)];
 233
 234     uint32_t *mb2b_xy; //FIXME are these 4 a good idea?
 235     uint32_t *mb2b8_xy;
 236     int b_stride; //FIXME use s->b4_stride
 237     int b8_stride;
 238
 239     int mb_linesize;   ///< may be equal to s->linesize or s->linesize*2, for mbaff
 240     int mb_uvlinesize;
 241
 242     int emu_edge_width;
 243     int emu_edge_height;
 244
 245     int halfpel_flag;
 246     int thirdpel_flag;
 247
 248     int unknown_svq3_flag;
 249     int next_slice_index;
 250
 251     SPS sps_buffer[MAX_SPS_COUNT];
 252     SPS sps; ///< current sps
 253
 254     PPS pps_buffer[MAX_PPS_COUNT];
 255     /**
 256      * current pps
 257      */
 258     PPS pps; //FIXME move to Picture perhaps? (->no) do we need that?
 259
 260     uint32_t dequant4_buffer[6][52][16];
 261     uint32_t dequant8_buffer[2][52][64];
 262     uint32_t (*dequant4_coeff[6])[16];
 263     uint32_t (*dequant8_coeff[2])[64];
 264     int dequant_coeff_pps;     ///< reinit tables when pps changes
 265
 266     int slice_num;
 267     uint8_t *slice_table_base;
 268     uint8_t *slice_table;      ///< slice_table_base + 2*mb_stride + 1
 269     int slice_type;
 270     int slice_type_fixed;
 271
 272     //interlacing specific flags
 273     int mb_aff_frame;
 274     int mb_field_decoding_flag;
 275     int mb_mbaff;              ///< mb_aff_frame && mb_field_decoding_flag
 276
 277     int sub_mb_type[4];
 278
 279     //POC stuff
 280     int poc_lsb;
 281     int poc_msb;
 282     int delta_poc_bottom;
 283     int delta_poc[2];
 284     int frame_num;
 285     int prev_poc_msb;             ///< poc_msb of the last reference pic for POC type 0
 286     int prev_poc_lsb;             ///< poc_lsb of the last reference pic for POC type 0
 287     int frame_num_offset;         ///< for POC type 2
 288     int prev_frame_num_offset;    ///< for POC type 2
 289     int prev_frame_num;           ///< frame_num of the last pic for POC type 1/2
 290
 291     /**
 292      * frame_num for frames or 2*frame_num for field pics.
 293      */
 294     int curr_pic_num;
 295
 296     /**
 297      * max_frame_num or 2*max_frame_num for field pics.
 298      */
 299     int max_pic_num;
 300
 301     //Weighted pred stuff
 302     int use_weight;
 303     int use_weight_chroma;
 304     int luma_log2_weight_denom;
 305     int chroma_log2_weight_denom;
 306     int luma_weight[2][48];
 307     int luma_offset[2][48];
 308     int chroma_weight[2][48][2];
 309     int chroma_offset[2][48][2];
 310     int implicit_weight[48][48];
 311
 312     //deblock
 313     int deblocking_filter;         ///< disable_deblocking_filter_idc with the values 0 and 1 swapped
 314     int slice_alpha_c0_offset;
 315     int slice_beta_offset;
 316
 317     int redundant_pic_count;
 318
 319     int direct_spatial_mv_pred;
 320     int dist_scale_factor[16];
 321     int dist_scale_factor_field[32];
 322     int map_col_to_list0[2][16];
 323     int map_col_to_list0_field[2][32];
 324
 325     /**
 326      * num_ref_idx_l0/1_active_minus1 + 1
 327      */
 328     int ref_count[2];            ///< counts frames or fields, depending on current mb mode
 329     Picture *short_ref[32];
 330     Picture *long_ref[32];
 331     Picture default_ref_list[2][32];
 332     Picture ref_list[2][48];     ///< 0..15: frame refs, 16..47: mbaff field refs
 333     Picture *delayed_pic[16]; //FIXME size?
 334     Picture *delayed_output_pic;
 335
 336     /**
 337      * memory management control operations buffer.
 338      */
 339     MMCO mmco[MAX_MMCO_COUNT];
 340     int mmco_index;
 341
 342     int long_ref_count;  ///< number of actual long term references
 343     int short_ref_count; ///< number of actual short term references
 344
 345     //data partitioning
 346     GetBitContext intra_gb;
 347     GetBitContext inter_gb;
 348     GetBitContext *intra_gb_ptr;
 349     GetBitContext *inter_gb_ptr;
 350
 351     DECLARE_ALIGNED_8(DCTELEM, mb[16*24]);
 352
 353     /**
 354      * Cabac
 355      */
 356     CABACContext cabac;
 357     uint8_t      cabac_state[460];
 358     int          cabac_init_idc;
 359
 360     /* 0x100 -> non null luma_dc, 0x80/0x40 -> non null chroma_dc (cb/cr), 0x?0 -> chroma_cbp(0,1,2), 0x0? luma_cbp */
 361     uint16_t     *cbp_table;
 362     int cbp;
 363     int top_cbp;   ///< cbp of the top neighbour, set by fill_caches() when CABAC is used
 364     int left_cbp;  ///< cbp bits gathered from the left neighbour(s), set by fill_caches() when CABAC is used
 365     /* chroma_pred_mode for i4x4 or i16x16, else 0 */
 366     uint8_t     *chroma_pred_mode_table;
 367     int         last_qscale_diff;
 368     int16_t     (*mvd_table[2])[2];
 369     DECLARE_ALIGNED_8(int16_t, mvd_cache[2][5*8][2]);
 370     uint8_t     *direct_table;
 371     uint8_t     direct_cache[5*8];
 372
 373     uint8_t zigzag_scan[16];
 374     uint8_t zigzag_scan8x8[64];
 375     uint8_t zigzag_scan8x8_cavlc[64];
 376     uint8_t field_scan[16];
 377     uint8_t field_scan8x8[64];
 378     uint8_t field_scan8x8_cavlc[64];
 379     const uint8_t *zigzag_scan_q0;
 380     const uint8_t *zigzag_scan8x8_q0;
 381     const uint8_t *zigzag_scan8x8_cavlc_q0;
 382     const uint8_t *field_scan_q0;
 383     const uint8_t *field_scan8x8_q0;
 384     const uint8_t *field_scan8x8_cavlc_q0;
 385
 386     int x264_build;
 387 }H264Context;
 388
 389 static VLC coeff_token_vlc[4];
     /* The table above and the five below are file-local (static) VLC tables.
      * NOTE(review): presumably they are built once by an init routine and read
      * by the CAVLC residual decoder; neither is in the visible part of the file. */
 390 static VLC chroma_dc_coeff_token_vlc;
 391
 392 static VLC total_zeros_vlc[15];
 393 static VLC chroma_dc_total_zeros_vlc[3];
 394
 395 static VLC run_vlc[6];
 396 static VLC run7_vlc;
 397
 398 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
     /* The prototype above and the three below are forward declarations of
      * file-local functions whose definitions are not in the visible part of
      * the file. filter_mb() and filter_mb_fast() take the same argument list. */
 399 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
 400 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
 401 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
 402
 403 static always_inline uint32_t pack16to32(int a, int b){
     /* Packs the low 16 bits of a and b into one 32 bit word so that, when the
      * word is stored to memory, a occupies the first two bytes and b the last
      * two on either endianness.
      * The value placed in the upper half is converted to uint32_t before the
      * shift: callers pass negative values (e.g. int8_t reference indices), and
      * left-shifting a negative int is undefined behaviour. The resulting bit
      * pattern is the one two's complement compilers already produced. */
 404 #ifdef WORDS_BIGENDIAN
 405    return (b&0xFFFF) + ((uint32_t)a<<16);
 406 #else
 407    return (a&0xFFFF) + ((uint32_t)b<<16);
 408 #endif
 409 }
 410
 411 /**
 412  * Fill a rectangle of w x h elements with a constant value.
      * Only row widths of 2, 4, 8 or 16 bytes (w*size) are implemented; any other
      * width ends in assert(0). Heights of 1, 2 and 4 rows are handled, except
      * that the 16 byte wide case has no early return for h==1 and therefore
      * only supports 2 and 4 rows. Any other height writes 4 rows and then
      * trips the final assert.
      * @param vp pointer to the top left element; must be aligned to
      *           FFMIN(w*size, STRIDE_ALIGN) bytes (asserted)
 413  * @param h height of the rectangle, should be a constant
 414  * @param w width of the rectangle, should be a constant
      *          (in elements, at most 4)
      * @param stride distance between two rows in elements; in bytes it must be
      *               a multiple of the row width in bytes (asserted)
      * @param val value to store; with size 1 it is multiplied by 0x0101 /
      *            0x01010101 to replicate a byte value across the row
 415  * @param size the size of val (1 or 4), should be a constant
 416  */
 417 static always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
 418     uint8_t *p= (uint8_t*)vp;
 419     assert(size==1 || size==4);
 420     assert(w<=4);
 421
 422     w      *= size;   // from here on w and stride are in bytes
 423     stride *= size;
 424
 425     assert((((long)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
 426     assert((stride&(w-1))==0);
 427     if(w==2){
 428         const uint16_t v= size==4 ? val : val*0x0101;
 429         *(uint16_t*)(p + 0*stride)= v;
 430         if(h==1) return;
 431         *(uint16_t*)(p + 1*stride)= v;
 432         if(h==2) return;
 433         *(uint16_t*)(p + 2*stride)=
 434         *(uint16_t*)(p + 3*stride)= v;
 435     }else if(w==4){
 436         const uint32_t v= size==4 ? val : val*0x01010101;
 437         *(uint32_t*)(p + 0*stride)= v;
 438         if(h==1) return;
 439         *(uint32_t*)(p + 1*stride)= v;
 440         if(h==2) return;
 441         *(uint32_t*)(p + 2*stride)=
 442         *(uint32_t*)(p + 3*stride)= v;
 443     }else if(w==8){
 444     //gcc can't optimize 64bit math on x86_32
         // The w==8 and w==16 branches exist twice: with 64 bit stores inside the
         // #if, with 32 bit stores inside the #else. Both variants share the
         // "}else if(w==8){" above and the closing "}else assert(0);" below.
 445 #if defined(ARCH_X86_64) || (defined(MP_WORDSIZE) && MP_WORDSIZE >= 64)
 446         const uint64_t v= val*0x0100000001ULL;   // val in both 32 bit halves
 447         *(uint64_t*)(p + 0*stride)= v;
 448         if(h==1) return;
 449         *(uint64_t*)(p + 1*stride)= v;
 450         if(h==2) return;
 451         *(uint64_t*)(p + 2*stride)=
 452         *(uint64_t*)(p + 3*stride)= v;
 453     }else if(w==16){
 454         const uint64_t v= val*0x0100000001ULL;   // val in both 32 bit halves
 455         *(uint64_t*)(p + 0+0*stride)=
 456         *(uint64_t*)(p + 8+0*stride)=
 457         *(uint64_t*)(p + 0+1*stride)=
 458         *(uint64_t*)(p + 8+1*stride)= v;         // rows 0 and 1 are always written together
 459         if(h==2) return;
 460         *(uint64_t*)(p + 0+2*stride)=
 461         *(uint64_t*)(p + 8+2*stride)=
 462         *(uint64_t*)(p + 0+3*stride)=
 463         *(uint64_t*)(p + 8+3*stride)= v;
 464 #else
 465         *(uint32_t*)(p + 0+0*stride)=
 466         *(uint32_t*)(p + 4+0*stride)= val;
 467         if(h==1) return;
 468         *(uint32_t*)(p + 0+1*stride)=
 469         *(uint32_t*)(p + 4+1*stride)= val;
 470         if(h==2) return;
 471         *(uint32_t*)(p + 0+2*stride)=
 472         *(uint32_t*)(p + 4+2*stride)=
 473         *(uint32_t*)(p + 0+3*stride)=
 474         *(uint32_t*)(p + 4+3*stride)= val;
 475     }else if(w==16){
 476         *(uint32_t*)(p + 0+0*stride)=
 477         *(uint32_t*)(p + 4+0*stride)=
 478         *(uint32_t*)(p + 8+0*stride)=
 479         *(uint32_t*)(p +12+0*stride)=
 480         *(uint32_t*)(p + 0+1*stride)=
 481         *(uint32_t*)(p + 4+1*stride)=
 482         *(uint32_t*)(p + 8+1*stride)=
 483         *(uint32_t*)(p +12+1*stride)= val;       // rows 0 and 1 are always written together
 484         if(h==2) return;
 485         *(uint32_t*)(p + 0+2*stride)=
 486         *(uint32_t*)(p + 4+2*stride)=
 487         *(uint32_t*)(p + 8+2*stride)=
 488         *(uint32_t*)(p +12+2*stride)=
 489         *(uint32_t*)(p + 0+3*stride)=
 490         *(uint32_t*)(p + 4+3*stride)=
 491         *(uint32_t*)(p + 8+3*stride)=
 492         *(uint32_t*)(p +12+3*stride)= val;
 493 #endif
 494     }else
 495         assert(0);   // unsupported row width
 496     assert(h==4);    // the shorter heights returned early; 4 rows were written here
 497 }
 498
 499 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 500     MpegEncContext * const s = &h->s;
 501     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 502     int topleft_xy, top_xy, topright_xy, left_xy[2];
 503     int topleft_type, top_type, topright_type, left_type[2];
 504     int left_block[8];
 505     int i;
 506
 507     //FIXME deblocking could skip the intra and nnz parts.
 508     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[mb_xy-s->mb_stride]) && !FRAME_MBAFF)
 509         return;
 510
 511     //wow what a mess, why didn't they simplify the interlacing&intra stuff, i can't imagine that these complex rules are worth it
 512
 513     top_xy     = mb_xy  - s->mb_stride;
 514     topleft_xy = top_xy - 1;
 515     topright_xy= top_xy + 1;
 516     left_xy[1] = left_xy[0] = mb_xy-1;
 517     left_block[0]= 0;
 518     left_block[1]= 1;
 519     left_block[2]= 2;
 520     left_block[3]= 3;
 521     left_block[4]= 7;
 522     left_block[5]= 10;
 523     left_block[6]= 8;
 524     left_block[7]= 11;
 525     if(FRAME_MBAFF){
 526         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 527         const int top_pair_xy      = pair_xy     - s->mb_stride;
 528         const int topleft_pair_xy  = top_pair_xy - 1;
 529         const int topright_pair_xy = top_pair_xy + 1;
 530         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 531         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 532         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 533         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 534         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 535         const int bottom = (s->mb_y & 1);
 536         tprintf("fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 537         if (bottom
 538                 ? !curr_mb_frame_flag // bottom macroblock
 539                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 540                 ) {
 541             top_xy -= s->mb_stride;
 542         }
 543         if (bottom
 544                 ? !curr_mb_frame_flag // bottom macroblock
 545                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 546                 ) {
 547             topleft_xy -= s->mb_stride;
 548         }
 549         if (bottom
 550                 ? !curr_mb_frame_flag // bottom macroblock
 551                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 552                 ) {
 553             topright_xy -= s->mb_stride;
 554         }
 555         if (left_mb_frame_flag != curr_mb_frame_flag) {
 556             left_xy[1] = left_xy[0] = pair_xy - 1;
 557             if (curr_mb_frame_flag) {
 558                 if (bottom) {
 559                     left_block[0]= 2;
 560                     left_block[1]= 2;
 561                     left_block[2]= 3;
 562                     left_block[3]= 3;
 563                     left_block[4]= 8;
 564                     left_block[5]= 11;
 565                     left_block[6]= 8;
 566                     left_block[7]= 11;
 567                 } else {
 568                     left_block[0]= 0;
 569                     left_block[1]= 0;
 570                     left_block[2]= 1;
 571                     left_block[3]= 1;
 572                     left_block[4]= 7;
 573                     left_block[5]= 10;
 574                     left_block[6]= 7;
 575                     left_block[7]= 10;
 576                 }
 577             } else {
 578                 left_xy[1] += s->mb_stride;
 579                 //left_block[0]= 0;
 580                 left_block[1]= 2;
 581                 left_block[2]= 0;
 582                 left_block[3]= 2;
 583                 //left_block[4]= 7;
 584                 left_block[5]= 10;
 585                 left_block[6]= 7;
 586                 left_block[7]= 10;
 587             }
 588         }
 589     }
 590
 591     h->top_mb_xy = top_xy;
 592     h->left_mb_xy[0] = left_xy[0];
 593     h->left_mb_xy[1] = left_xy[1];
 594     if(for_deblock){
 595         topleft_type = 0;
 596         topright_type = 0;
 597         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
 598         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
 599         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
 600
 601         if(FRAME_MBAFF && !IS_INTRA(mb_type)){
 602             int list;
 603             int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
 604             for(i=0; i<16; i++)
 605                 h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
 606             for(list=0; list<1+(h->slice_type==B_TYPE); list++){
 607                 if(USES_LIST(mb_type,list)){
 608                     uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
 609                     uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
 610                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 611                     for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
 612                         dst[0] = src[0];
 613                         dst[1] = src[1];
 614                         dst[2] = src[2];
 615                         dst[3] = src[3];
 616                     }
 617                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 618                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
 619                     ref += h->b8_stride;
 620                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 621                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
 622                 }else{
 623                     fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
 624                     fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
 625                 }
 626             }
 627         }
 628     }else{
 629         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 630         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 631         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 632         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 633         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 634     }
 635
 636     if(IS_INTRA(mb_type)){
 637         h->topleft_samples_available=
 638         h->top_samples_available=
 639         h->left_samples_available= 0xFFFF;
 640         h->topright_samples_available= 0xEEEA;
 641
 642         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
 643             h->topleft_samples_available= 0xB3FF;
 644             h->top_samples_available= 0x33FF;
 645             h->topright_samples_available= 0x26EA;
 646         }
 647         for(i=0; i<2; i++){
 648             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
 649                 h->topleft_samples_available&= 0xDF5F;
 650                 h->left_samples_available&= 0x5F5F;
 651             }
 652         }
 653
 654         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
 655             h->topleft_samples_available&= 0x7FFF;
 656
 657         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
 658             h->topright_samples_available&= 0xFBFF;
 659
 660         if(IS_INTRA4x4(mb_type)){
 661             if(IS_INTRA4x4(top_type)){
 662                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 663                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 664                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 665                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 666             }else{
 667                 int pred;
 668                 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
 669                     pred= -1;
 670                 else{
 671                     pred= 2;
 672                 }
 673                 h->intra4x4_pred_mode_cache[4+8*0]=
 674                 h->intra4x4_pred_mode_cache[5+8*0]=
 675                 h->intra4x4_pred_mode_cache[6+8*0]=
 676                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 677             }
 678             for(i=0; i<2; i++){
 679                 if(IS_INTRA4x4(left_type[i])){
 680                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 681                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 682                 }else{
 683                     int pred;
 684                     if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
 685                         pred= -1;
 686                     else{
 687                         pred= 2;
 688                     }
 689                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 690                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 691                 }
 692             }
 693         }
 694     }
 695
 696
 697 /*
 698 0 . T T. T T T T
 699 1 L . .L . . . .
 700 2 L . .L . . . .
 701 3 . T TL . . . .
 702 4 L . .L . . . .
 703 5 L . .. . . . .
 704 */
 705 //FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
 706     if(top_type){
 707         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 708         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 709         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 710         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 711
 712         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 713         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 714
 715         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 716         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 717
 718     }else{
 719         h->non_zero_count_cache[4+8*0]=
 720         h->non_zero_count_cache[5+8*0]=
 721         h->non_zero_count_cache[6+8*0]=
 722         h->non_zero_count_cache[7+8*0]=
 723
 724         h->non_zero_count_cache[1+8*0]=
 725         h->non_zero_count_cache[2+8*0]=
 726
 727         h->non_zero_count_cache[1+8*3]=
 728         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 729
 730     }
 731
 732     for (i=0; i<2; i++) {
 733         if(left_type[i]){
 734             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 735             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 736             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 737             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 738         }else{
 739             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 740             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 741             h->non_zero_count_cache[0+8*1 +   8*i]=
 742             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 743         }
 744     }
 745
 746     if( h->pps.cabac ) {
 747         // top_cbp
 748         if(top_type) {
 749             h->top_cbp = h->cbp_table[top_xy];
 750         } else if(IS_INTRA(mb_type)) {
 751             h->top_cbp = 0x1C0;
 752         } else {
 753             h->top_cbp = 0;
 754         }
 755         // left_cbp
 756         if (left_type[0]) {
 757             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 758         } else if(IS_INTRA(mb_type)) {
 759             h->left_cbp = 0x1C0;
 760         } else {
 761             h->left_cbp = 0;
 762         }
 763         if (left_type[0]) {
 764             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 765         }
 766         if (left_type[1]) {
 767             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 768         }
 769     }
 770
 771 #if 1
 772     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 773         int list;
 774         for(list=0; list<1+(h->slice_type==B_TYPE); list++){
 775             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 776                 /*if(!h->mv_cache_clean[list]){
 777                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 778                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 779                     h->mv_cache_clean[list]= 1;
 780                 }*/
 781                 continue;
 782             }
 783             h->mv_cache_clean[list]= 0;
 784
 785             if(USES_LIST(top_type, list)){
 786                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 787                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 788                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 789                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 790                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 791                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 792                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 793                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 794                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 795                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 796             }else{
 797                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 798                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 799                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 800                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 801                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 802             }
 803
 804             //FIXME unify cleanup or sth
 805             if(USES_LIST(left_type[0], list)){
 806                 const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 807                 const int b8_xy= h->mb2b8_xy[left_xy[0]] + 1;
 808                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0]];
 809                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1]];
 810                 h->ref_cache[list][scan8[0] - 1 + 0*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0]>>1)];
 811                 h->ref_cache[list][scan8[0] - 1 + 1*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1]>>1)];
 812             }else{
 813                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 0*8]=
 814                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 1*8]= 0;
 815                 h->ref_cache[list][scan8[0] - 1 + 0*8]=
 816                 h->ref_cache[list][scan8[0] - 1 + 1*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 817             }
 818
 819             if(USES_LIST(left_type[1], list)){
 820                 const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 821                 const int b8_xy= h->mb2b8_xy[left_xy[1]] + 1;
 822                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[2]];
 823                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[3]];
 824                 h->ref_cache[list][scan8[0] - 1 + 2*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[2]>>1)];
 825                 h->ref_cache[list][scan8[0] - 1 + 3*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[3]>>1)];
 826             }else{
 827                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 2*8]=
 828                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 3*8]= 0;
 829                 h->ref_cache[list][scan8[0] - 1 + 2*8]=
 830                 h->ref_cache[list][scan8[0] - 1 + 3*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 831                 assert((!left_type[0]) == (!left_type[1]));
 832             }
 833
 834             if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
 835                 continue;
 836
 837             if(USES_LIST(topleft_type, list)){
 838                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
 839                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride;
 840                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 841                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 842             }else{
 843                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 844                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 845             }
 846
 847             if(USES_LIST(topright_type, list)){
 848                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 849                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 850                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 851                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 852             }else{
 853                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 854                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 855             }
 856
 857             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 858                 continue;
 859
 860             h->ref_cache[list][scan8[5 ]+1] =
 861             h->ref_cache[list][scan8[7 ]+1] =
 862             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 863             h->ref_cache[list][scan8[4 ]] =
 864             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 865             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 866             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 867             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 868             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 869             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 870
 871             if( h->pps.cabac ) {
 872                 /* XXX beurk, Load mvd */
 873                 if(USES_LIST(top_type, list)){
 874                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 875                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 876                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 877                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 878                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 879                 }else{
 880                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 881                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 882                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 883                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 884                 }
 885                 if(USES_LIST(left_type[0], list)){
 886                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 887                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 888                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 889                 }else{
 890                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 891                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 892                 }
 893                 if(USES_LIST(left_type[1], list)){
 894                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 895                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 896                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 897                 }else{
 898                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 899                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 900                 }
 901                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 902                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 903                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 904                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 905                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 906
 907                 if(h->slice_type == B_TYPE){
 908                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 909
 910                     if(IS_DIRECT(top_type)){
 911                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 912                     }else if(IS_8X8(top_type)){
 913                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 914                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 915                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 916                     }else{
 917                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 918                     }
 919
 920                     if(IS_DIRECT(left_type[0]))
 921                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 922                     else if(IS_8X8(left_type[0]))
 923                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 924                     else
 925                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 926
 927                     if(IS_DIRECT(left_type[1]))
 928                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 929                     else if(IS_8X8(left_type[1]))
 930                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 931                     else
 932                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 933                 }
 934             }
 935
 936             if(FRAME_MBAFF){
 937 #define MAP_MVS\
 938                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 939                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 940                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 941                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 942                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 943                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 944                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 945                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 946                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 947                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 948                 if(MB_FIELD){
 949 #define MAP_F2F(idx, mb_type)\
 950                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 951                         h->ref_cache[list][idx] <<= 1;\
 952                         h->mv_cache[list][idx][1] /= 2;\
 953                         h->mvd_cache[list][idx][1] /= 2;\
 954                     }
 955                     MAP_MVS
 956 #undef MAP_F2F
 957                 }else{
 958 #define MAP_F2F(idx, mb_type)\
 959                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 960                         h->ref_cache[list][idx] >>= 1;\
 961                         h->mv_cache[list][idx][1] <<= 1;\
 962                         h->mvd_cache[list][idx][1] <<= 1;\
 963                     }
 964                     MAP_MVS
 965 #undef MAP_F2F
 966                 }
 967             }
 968         }
 969     }
 970 #endif
 971
 972     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 973 }
 974
 975 static inline void write_back_intra_pred_mode(H264Context *h){
 976     MpegEncContext * const s = &h->s;
 977     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 978
 979     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 980     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 981     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 982     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 983     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 984     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 985     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 986 }
 987
 988 /**
 989  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 990  */
 991 static inline int check_intra4x4_pred_mode(H264Context *h){
 992     MpegEncContext * const s = &h->s;
 993     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 994     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 995     int i;
 996
 997     if(!(h->top_samples_available&0x8000)){
 998         for(i=0; i<4; i++){
 999             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
1000             if(status<0){
1001                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
1002                 return -1;
1003             } else if(status){
1004                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
1005             }
1006         }
1007     }
1008
1009     if(!(h->left_samples_available&0x8000)){
1010         for(i=0; i<4; i++){
1011             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
1012             if(status<0){
1013                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
1014                 return -1;
1015             } else if(status){
1016                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
1017             }
1018         }
1019     }
1020
1021     return 0;
1022 } //FIXME cleanup like next
1023
1024 /**
1025  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
1026  */
1027 static inline int check_intra_pred_mode(H264Context *h, int mode){
1028     MpegEncContext * const s = &h->s;
1029     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
1030     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
1031
1032     if(mode < 0 || mode > 6) {
1033         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
1034         return -1;
1035     }
1036
1037     if(!(h->top_samples_available&0x8000)){
1038         mode= top[ mode ];
1039         if(mode<0){
1040             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
1041             return -1;
1042         }
1043     }
1044
1045     if(!(h->left_samples_available&0x8000)){
1046         mode= left[ mode ];
1047         if(mode<0){
1048             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
1049             return -1;
1050         }
1051     }
1052
1053     return mode;
1054 }
1055
1056 /**
1057  * gets the predicted intra4x4 prediction mode.
1058  */
1059 static inline int pred_intra_mode(H264Context *h, int n){
1060     const int index8= scan8[n];
1061     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
1062     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
1063     const int min= FFMIN(left, top);
1064
1065     tprintf("mode:%d %d min:%d\n", left ,top, min);
1066
1067     if(min<0) return DC_PRED;
1068     else      return min;
1069 }
1070
1071 static inline void write_back_non_zero_count(H264Context *h){
1072     MpegEncContext * const s = &h->s;
1073     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
1074
1075     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
1076     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
1077     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
1078     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
1079     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
1080     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
1081     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
1082
1083     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
1084     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
1085     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
1086
1087     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
1088     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
1089     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
1090
1091     if(FRAME_MBAFF){
1092         // store all luma nnzs, for deblocking
1093         int v = 0, i;
1094         for(i=0; i<16; i++)
1095             v += (!!h->non_zero_count_cache[scan8[i]]) << i;
1096         *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
1097     }
1098 }
1099
1100 /**
1101  * gets the predicted number of non zero coefficients.
1102  * @param n block index
1103  */
1104 static inline int pred_non_zero_count(H264Context *h, int n){
1105     const int index8= scan8[n];
1106     const int left= h->non_zero_count_cache[index8 - 1];
1107     const int top = h->non_zero_count_cache[index8 - 8];
1108     int i= left + top;
1109
1110     if(i<64) i= (i+1)>>1;
1111
1112     tprintf("pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
1113
1114     return i&31;
1115 }
1116
1117 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
1118     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
1119
1120     /* there is no consistent mapping of mvs to neighboring locations that will
1121      * make mbaff happy, so we can't move all this logic to fill_caches */
1122     if(FRAME_MBAFF){
1123         MpegEncContext *s = &h->s;
1124         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
1125         const int16_t *mv;
1126         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
1127         *C = h->mv_cache[list][scan8[0]-2];
1128
1129         if(!MB_FIELD
1130            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
1131             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
1132             if(IS_INTERLACED(mb_types[topright_xy])){
1133 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
1134                 const int x4 = X4, y4 = Y4;\
1135                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
1136                 if(!USES_LIST(mb_type,list) && !IS_8X8(mb_type))\
1137                     return LIST_NOT_USED;\
1138                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
1139                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
1140                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
1141                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
1142
1143                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
1144             }
1145         }
1146         if(topright_ref == PART_NOT_AVAILABLE
1147            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
1148            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
1149             if(!MB_FIELD
1150                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
1151                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
1152             }
1153             if(MB_FIELD
1154                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
1155                && i >= scan8[0]+8){
1156                 // leftshift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's ok.
1157                 SET_DIAG_MV(>>1, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
1158             }
1159         }
1160 #undef SET_DIAG_MV
1161     }
1162
1163     if(topright_ref != PART_NOT_AVAILABLE){
1164         *C= h->mv_cache[list][ i - 8 + part_width ];
1165         return topright_ref;
1166     }else{
1167         tprintf("topright MV not available\n");
1168
1169         *C= h->mv_cache[list][ i - 8 - 1 ];
1170         return h->ref_cache[list][ i - 8 - 1 ];
1171     }
1172 }
1173
1174 /**
1175  * gets the predicted MV.
1176  * @param n the block index
1177  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
1178  * @param mx the x component of the predicted motion vector
1179  * @param my the y component of the predicted motion vector
1180  */
1181 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
1182     const int index8= scan8[n];
1183     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
1184     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
1185     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
1186     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
1187     const int16_t * C;
1188     int diagonal_ref, match_count;
1189
1190     assert(part_width==1 || part_width==2 || part_width==4);
1191
1192 /* mv_cache
1193   B . . A T T T T
1194   U . . L . . , .
1195   U . . L . . . .
1196   U . . L . . , .
1197   . . . L . . . .
1198 */
1199
1200     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
1201     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
1202     tprintf("pred_motion match_count=%d\n", match_count);
1203     if(match_count > 1){ //most common
1204         *mx= mid_pred(A[0], B[0], C[0]);
1205         *my= mid_pred(A[1], B[1], C[1]);
1206     }else if(match_count==1){
1207         if(left_ref==ref){
1208             *mx= A[0];
1209             *my= A[1];
1210         }else if(top_ref==ref){
1211             *mx= B[0];
1212             *my= B[1];
1213         }else{
1214             *mx= C[0];
1215             *my= C[1];
1216         }
1217     }else{
1218         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
1219             *mx= A[0];
1220             *my= A[1];
1221         }else{
1222             *mx= mid_pred(A[0], B[0], C[0]);
1223             *my= mid_pred(A[1], B[1], C[1]);
1224         }
1225     }
1226
1227     tprintf("pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
1228 }
1229
1230 /**
1231  * gets the directionally predicted 16x8 MV.
1232  * @param n the block index
1233  * @param mx the x component of the predicted motion vector
1234  * @param my the y component of the predicted motion vector
1235  */
1236 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
1237     if(n==0){
1238         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
1239         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
1240
1241         tprintf("pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
1242
1243         if(top_ref == ref){
1244             *mx= B[0];
1245             *my= B[1];
1246             return;
1247         }
1248     }else{
1249         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
1250         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
1251
1252         tprintf("pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
1253
1254         if(left_ref == ref){
1255             *mx= A[0];
1256             *my= A[1];
1257             return;
1258         }
1259     }
1260
1261     //RARE
1262     pred_motion(h, n, 4, list, ref, mx, my);
1263 }
1264
1265 /**
1266  * gets the directionally predicted 8x16 MV.
1267  * @param n the block index
1268  * @param mx the x component of the predicted motion vector
1269  * @param my the y component of the predicted motion vector
1270  */
1271 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
1272     if(n==0){
1273         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
1274         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
1275
1276         tprintf("pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
1277
1278         if(left_ref == ref){
1279             *mx= A[0];
1280             *my= A[1];
1281             return;
1282         }
1283     }else{
1284         const int16_t * C;
1285         int diagonal_ref;
1286
1287         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
1288
1289         tprintf("pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
1290
1291         if(diagonal_ref == ref){
1292             *mx= C[0];
1293             *my= C[1];
1294             return;
1295         }
1296     }
1297
1298     //RARE
1299     pred_motion(h, n, 2, list, ref, mx, my);
1300 }
1301
1302 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
1303     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
1304     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
1305
1306     tprintf("pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
1307
1308     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
1309        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
1310        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
1311
1312         *mx = *my = 0;
1313         return;
1314     }
1315
1316     pred_motion(h, 0, 4, 0, 0, mx, my);
1317
1318     return;
1319 }
1320
1321 static inline void direct_dist_scale_factor(H264Context * const h){
1322     const int poc = h->s.current_picture_ptr->poc;
1323     const int poc1 = h->ref_list[1][0].poc;
1324     int i;
1325     for(i=0; i<h->ref_count[0]; i++){
1326         int poc0 = h->ref_list[0][i].poc;
1327         int td = clip(poc1 - poc0, -128, 127);
1328         if(td == 0 /* FIXME || pic0 is a long-term ref */){
1329             h->dist_scale_factor[i] = 256;
1330         }else{
1331             int tb = clip(poc - poc0, -128, 127);
1332             int tx = (16384 + (FFABS(td) >> 1)) / td;
1333             h->dist_scale_factor[i] = clip((tb*tx + 32) >> 6, -1024, 1023);
1334         }
1335     }
1336     if(FRAME_MBAFF){
1337         for(i=0; i<h->ref_count[0]; i++){
1338             h->dist_scale_factor_field[2*i] =
1339             h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
1340         }
1341     }
1342 }
1343 static inline void direct_ref_list_init(H264Context * const h){
1344     MpegEncContext * const s = &h->s;
1345     Picture * const ref1 = &h->ref_list[1][0];
1346     Picture * const cur = s->current_picture_ptr;
1347     int list, i, j;
1348     if(cur->pict_type == I_TYPE)
1349         cur->ref_count[0] = 0;
1350     if(cur->pict_type != B_TYPE)
1351         cur->ref_count[1] = 0;
1352     for(list=0; list<2; list++){
1353         cur->ref_count[list] = h->ref_count[list];
1354         for(j=0; j<h->ref_count[list]; j++)
1355             cur->ref_poc[list][j] = h->ref_list[list][j].poc;
1356     }
1357     if(cur->pict_type != B_TYPE || h->direct_spatial_mv_pred)
1358         return;
1359     for(list=0; list<2; list++){
1360         for(i=0; i<ref1->ref_count[list]; i++){
1361             const int poc = ref1->ref_poc[list][i];
1362             h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
1363             for(j=0; j<h->ref_count[list]; j++)
1364                 if(h->ref_list[list][j].poc == poc){
1365                     h->map_col_to_list0[list][i] = j;
1366                     break;
1367                 }
1368         }
1369     }
1370     if(FRAME_MBAFF){
1371         for(list=0; list<2; list++){
1372             for(i=0; i<ref1->ref_count[list]; i++){
1373                 j = h->map_col_to_list0[list][i];
1374                 h->map_col_to_list0_field[list][2*i] = 2*j;
1375                 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
1376             }
1377         }
1378     }
1379 }
1380
1381 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
1382     MpegEncContext * const s = &h->s;
1383     const int mb_xy =   s->mb_x +   s->mb_y*s->mb_stride;
1384     const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1385     const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1386     const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
1387     const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
1388     const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
1389     const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
1390     const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
1391     const int is_b8x8 = IS_8X8(*mb_type);
1392     int sub_mb_type;
1393     int i8, i4;
1394
1395 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
1396     if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
1397         /* FIXME save sub mb types from previous frames (or derive from MVs)
1398          * so we know exactly what block size to use */
1399         sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1400         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
1401     }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
1402         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1403         *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1404     }else{
1405         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1406         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
1407     }
1408     if(!is_b8x8)
1409         *mb_type |= MB_TYPE_DIRECT2;
1410     if(MB_FIELD)
1411         *mb_type |= MB_TYPE_INTERLACED;
1412
1413     tprintf("mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);
1414
1415     if(h->direct_spatial_mv_pred){
1416         int ref[2];
1417         int mv[2][2];
1418         int list;
1419
1420         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1421
1422         /* ref = min(neighbors) */
1423         for(list=0; list<2; list++){
1424             int refa = h->ref_cache[list][scan8[0] - 1];
1425             int refb = h->ref_cache[list][scan8[0] - 8];
1426             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1427             if(refc == -2)
1428                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1429             ref[list] = refa;
1430             if(ref[list] < 0 || (refb < ref[list] && refb >= 0))
1431                 ref[list] = refb;
1432             if(ref[list] < 0 || (refc < ref[list] && refc >= 0))
1433                 ref[list] = refc;
1434             if(ref[list] < 0)
1435                 ref[list] = -1;
1436         }
1437
1438         if(ref[0] < 0 && ref[1] < 0){
1439             ref[0] = ref[1] = 0;
1440             mv[0][0] = mv[0][1] =
1441             mv[1][0] = mv[1][1] = 0;
1442         }else{
1443             for(list=0; list<2; list++){
1444                 if(ref[list] >= 0)
1445                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1446                 else
1447                     mv[list][0] = mv[list][1] = 0;
1448             }
1449         }
1450
1451         if(ref[1] < 0){
1452             *mb_type &= ~MB_TYPE_P0L1;
1453             sub_mb_type &= ~MB_TYPE_P0L1;
1454         }else if(ref[0] < 0){
1455             *mb_type &= ~MB_TYPE_P0L0;
1456             sub_mb_type &= ~MB_TYPE_P0L0;
1457         }
1458
1459         if(IS_16X16(*mb_type)){
1460             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1461             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1462             if(!IS_INTRA(mb_type_col)
1463                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1464                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1465                        && (h->x264_build>33 || !h->x264_build)))){
1466                 if(ref[0] > 0)
1467                     fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1468                 else
1469                     fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
1470                 if(ref[1] > 0)
1471                     fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1472                 else
1473                     fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
1474             }else{
1475                 fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1476                 fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1477             }
1478         }else{
1479             for(i8=0; i8<4; i8++){
1480                 const int x8 = i8&1;
1481                 const int y8 = i8>>1;
1482
1483                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1484                     continue;
1485                 h->sub_mb_type[i8] = sub_mb_type;
1486
1487                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1488                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1489                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1490                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1491
1492                 /* col_zero_flag */
1493                 if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
1494                                               || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
1495                                                   && (h->x264_build>33 || !h->x264_build)))){
1496                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
1497                     if(IS_SUB_8X8(sub_mb_type)){
1498                         const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1499                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1500                             if(ref[0] == 0)
1501                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1502                             if(ref[1] == 0)
1503                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1504                         }
1505                     }else
1506                     for(i4=0; i4<4; i4++){
1507                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1508                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1509                             if(ref[0] == 0)
1510                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1511                             if(ref[1] == 0)
1512                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1513                         }
1514                     }
1515                 }
1516             }
1517         }
1518     }else{ /* direct temporal mv pred */
1519         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1520         const int *dist_scale_factor = h->dist_scale_factor;
1521
1522         if(FRAME_MBAFF){
1523             if(IS_INTERLACED(*mb_type)){
1524                 map_col_to_list0[0] = h->map_col_to_list0_field[0];
1525                 map_col_to_list0[1] = h->map_col_to_list0_field[1];
1526                 dist_scale_factor = h->dist_scale_factor_field;
1527             }
1528             if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1529                 /* FIXME assumes direct_8x8_inference == 1 */
1530                 const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1531                 int mb_types_col[2];
1532                 int y_shift;
1533
1534                 *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
1535                          | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
1536                          | (*mb_type & MB_TYPE_INTERLACED);
1537                 sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;
1538
1539                 if(IS_INTERLACED(*mb_type)){
1540                     /* frame to field scaling */
1541                     mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1542                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1543                     if(s->mb_y&1){
1544                         l1ref0 -= 2*h->b8_stride;
1545                         l1ref1 -= 2*h->b8_stride;
1546                         l1mv0 -= 4*h->b_stride;
1547                         l1mv1 -= 4*h->b_stride;
1548                     }
1549                     y_shift = 0;
1550
1551                     if(   (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
1552                        && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
1553                        && !is_b8x8)
1554                         *mb_type |= MB_TYPE_16x8;
1555                     else
1556                         *mb_type |= MB_TYPE_8x8;
1557                 }else{
1558                     /* field to frame scaling */
1559                     /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
1560                      * but in MBAFF, top and bottom POC are equal */
1561                     int dy = (s->mb_y&1) ? 1 : 2;
1562                     mb_types_col[0] =
1563                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1564                     l1ref0 += dy*h->b8_stride;
1565                     l1ref1 += dy*h->b8_stride;
1566                     l1mv0 += 2*dy*h->b_stride;
1567                     l1mv1 += 2*dy*h->b_stride;
1568                     y_shift = 2;
1569
1570                     if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
1571                        && !is_b8x8)
1572                         *mb_type |= MB_TYPE_16x16;
1573                     else
1574                         *mb_type |= MB_TYPE_8x8;
1575                 }
1576
1577                 for(i8=0; i8<4; i8++){
1578                     const int x8 = i8&1;
1579                     const int y8 = i8>>1;
1580                     int ref0, scale;
1581                     const int16_t (*l1mv)[2]= l1mv0;
1582
1583                     if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1584                         continue;
1585                     h->sub_mb_type[i8] = sub_mb_type;
1586
1587                     fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1588                     if(IS_INTRA(mb_types_col[y8])){
1589                         fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1590                         fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1591                         fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1592                         continue;
1593                     }
1594
1595                     ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
1596                     if(ref0 >= 0)
1597                         ref0 = map_col_to_list0[0][ref0*2>>y_shift];
1598                     else{
1599                         ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
1600                         l1mv= l1mv1;
1601                     }
1602                     scale = dist_scale_factor[ref0];
1603                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1604
1605                     {
1606                         const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
1607                         int my_col = (mv_col[1]<<y_shift)/2;
1608                         int mx = (scale * mv_col[0] + 128) >> 8;
1609                         int my = (scale * my_col + 128) >> 8;
1610                         fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1611                         fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1612                     }
1613                 }
1614                 return;
1615             }
1616         }
1617
1618         /* one-to-one mv scaling */
1619
1620         if(IS_16X16(*mb_type)){
1621             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1622             if(IS_INTRA(mb_type_col)){
1623                 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
1624                 fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
1625                 fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
1626             }else{
1627                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
1628                                                 : map_col_to_list0[1][l1ref1[0]];
1629                 const int scale = dist_scale_factor[ref0];
1630                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1631                 int mv_l0[2];
1632                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1633                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1634                 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref0, 1);
1635                 fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv_l0[0],mv_l0[1]), 4);
1636                 fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]), 4);
1637             }
1638         }else{
1639             for(i8=0; i8<4; i8++){
1640                 const int x8 = i8&1;
1641                 const int y8 = i8>>1;
1642                 int ref0, scale;
1643                 const int16_t (*l1mv)[2]= l1mv0;
1644
1645                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1646                     continue;
1647                 h->sub_mb_type[i8] = sub_mb_type;
1648                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1649                 if(IS_INTRA(mb_type_col)){
1650                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1651                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1652                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1653                     continue;
1654                 }
1655
1656                 ref0 = l1ref0[x8 + y8*h->b8_stride];
1657                 if(ref0 >= 0)
1658                     ref0 = map_col_to_list0[0][ref0];
1659                 else{
1660                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
1661                     l1mv= l1mv1;
1662                 }
1663                 scale = dist_scale_factor[ref0];
1664
1665                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1666                 if(IS_SUB_8X8(sub_mb_type)){
1667                     const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1668                     int mx = (scale * mv_col[0] + 128) >> 8;
1669                     int my = (scale * mv_col[1] + 128) >> 8;
1670                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1671                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1672                 }else
1673                 for(i4=0; i4<4; i4++){
1674                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1675                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1676                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1677                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1678                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1679                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1680                 }
1681             }
1682         }
1683     }
1684 }
1685
1686 static inline void write_back_motion(H264Context *h, int mb_type){
1687     MpegEncContext * const s = &h->s;
1688     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1689     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1690     int list;
1691
1692     if(!USES_LIST(mb_type, 0))
1693         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1694
1695     for(list=0; list<2; list++){
1696         int y;
1697         if(!USES_LIST(mb_type, list))
1698             continue;
1699
1700         for(y=0; y<4; y++){
1701             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1702             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1703         }
1704         if( h->pps.cabac ) {
1705             if(IS_SKIP(mb_type))
1706                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1707             else
1708             for(y=0; y<4; y++){
1709                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1710                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1711             }
1712         }
1713
1714         {
1715             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1716             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1717             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1718             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1719             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1720         }
1721     }
1722
1723     if(h->slice_type == B_TYPE && h->pps.cabac){
1724         if(IS_8X8(mb_type)){
1725             uint8_t *direct_table = &h->direct_table[b8_xy];
1726             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1727             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1728             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1729         }
1730     }
1731 }
1732
1733 /**
1734  * Decodes a network abstraction layer unit.
1735  * @param consumed is the number of bytes used as input
1736  * @param length is the length of the array
1737  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1738  * @returns decoded bytes, might be src+1 if no escapes
1739  */
1740 static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *consumed, int length){
1741     int i, si, di;
1742     uint8_t *dst;
1743
1744 //    src[0]&0x80;                //forbidden bit
1745     h->nal_ref_idc= src[0]>>5;
1746     h->nal_unit_type= src[0]&0x1F;
1747
1748     src++; length--;
1749 #if 0
1750     for(i=0; i<length; i++)
1751         printf("%2X ", src[i]);
1752 #endif
1753     for(i=0; i+1<length; i+=2){
1754         if(src[i]) continue;
1755         if(i>0 && src[i-1]==0) i--;
1756         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1757             if(src[i+2]!=3){
1758                 /* startcode, so we must be past the end */
1759                 length=i;
1760             }
1761             break;
1762         }
1763     }
1764
1765     if(i>=length-1){ //no escaped 0
1766         *dst_length= length;
1767         *consumed= length+1; //+1 for the header
1768         return src;
1769     }
1770
1771     h->rbsp_buffer= av_fast_realloc(h->rbsp_buffer, &h->rbsp_buffer_size, length);
1772     dst= h->rbsp_buffer;
1773
1774 //printf("decoding esc\n");
1775     si=di=0;
1776     while(si<length){
1777         //remove escapes (very rare 1:2^22)
1778         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1779             if(src[si+2]==3){ //escape
1780                 dst[di++]= 0;
1781                 dst[di++]= 0;
1782                 si+=3;
1783                 continue;
1784             }else //next start code
1785                 break;
1786         }
1787
1788         dst[di++]= src[si++];
1789     }
1790
1791     *dst_length= di;
1792     *consumed= si + 1;//+1 for the header
1793 //FIXME store exact number of bits in the getbitcontext (its needed for decoding)
1794     return dst;
1795 }
1796
/**
 * identifies the exact end of the bitstream
 * Looks for the lowest set bit of the byte at src.
 * @return the length of the trailing (1 if bit 0 is set, up to 8 if only
 *         bit 7 is set), or 0 if damaged (the byte is 0)
 */
static int decode_rbsp_trailing(uint8_t *src){
    const int last_byte= *src;
    int bit;

    tprintf("rbsp trailing %X\n", last_byte);

    for(bit=0; bit<8; bit++){
        if((last_byte>>bit) & 1)
            return bit+1;
    }
    return 0;
}
1813
1814 /**
1815  * idct tranforms the 16 dc values and dequantize them.
1816  * @param qp quantization parameter
1817  */
1818 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1819 #define stride 16
1820     int i;
1821     int temp[16]; //FIXME check if this is a good idea
1822     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1823     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1824
1825 //memset(block, 64, 2*256);
1826 //return;
1827     for(i=0; i<4; i++){
1828         const int offset= y_offset[i];
1829         const int z0= block[offset+stride*0] + block[offset+stride*4];
1830         const int z1= block[offset+stride*0] - block[offset+stride*4];
1831         const int z2= block[offset+stride*1] - block[offset+stride*5];
1832         const int z3= block[offset+stride*1] + block[offset+stride*5];
1833
1834         temp[4*i+0]= z0+z3;
1835         temp[4*i+1]= z1+z2;
1836         temp[4*i+2]= z1-z2;
1837         temp[4*i+3]= z0-z3;
1838     }
1839
1840     for(i=0; i<4; i++){
1841         const int offset= x_offset[i];
1842         const int z0= temp[4*0+i] + temp[4*2+i];
1843         const int z1= temp[4*0+i] - temp[4*2+i];
1844         const int z2= temp[4*1+i] - temp[4*3+i];
1845         const int z3= temp[4*1+i] + temp[4*3+i];
1846
1847         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_resdual
1848         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1849         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1850         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1851     }
1852 }
1853
1854 #if 0
1855 /**
1856  * dct transforms the 16 dc values (disabled code, excluded by the enclosing #if 0).
1857  * @param qp quantization parameter ??? FIXME (the parameter itself is commented out)
1858  */
1859 static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
1860 //    const int qmul= dequant_coeff[qp][0];
1861     int i;
1862     int temp[16]; //FIXME check if this is a good idea
1863     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride}; //stride is the macro defined inside h264_luma_dc_dequant_idct_c
1864     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1865
1866     for(i=0; i<4; i++){ //first pass: butterflies over 4 values of block[], results go to temp[]
1867         const int offset= y_offset[i];
1868         const int z0= block[offset+stride*0] + block[offset+stride*4];
1869         const int z1= block[offset+stride*0] - block[offset+stride*4];
1870         const int z2= block[offset+stride*1] - block[offset+stride*5];
1871         const int z3= block[offset+stride*1] + block[offset+stride*5];
1872
1873         temp[4*i+0]= z0+z3;
1874         temp[4*i+1]= z1+z2;
1875         temp[4*i+2]= z1-z2;
1876         temp[4*i+3]= z0-z3;
1877     }
1878
1879     for(i=0; i<4; i++){ //second pass over temp[], results are halved and written back to block[]
1880         const int offset= x_offset[i];
1881         const int z0= temp[4*0+i] + temp[4*2+i];
1882         const int z1= temp[4*0+i] - temp[4*2+i];
1883         const int z2= temp[4*1+i] - temp[4*3+i];
1884         const int z3= temp[4*1+i] + temp[4*3+i];
1885
1886         block[stride*0 +offset]= (z0 + z3)>>1;
1887         block[stride*2 +offset]= (z1 + z2)>>1;
1888         block[stride*8 +offset]= (z1 - z2)>>1;
1889         block[stride*10+offset]= (z0 - z3)>>1;
1890     }
1891 }
1892 #endif
1893
1894 #undef xStride
1895 #undef stride
1896
1897 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1898     const int stride= 16*2;
1899     const int xStride= 16;
1900     int a,b,c,d,e;
1901
1902     a= block[stride*0 + xStride*0];
1903     b= block[stride*0 + xStride*1];
1904     c= block[stride*1 + xStride*0];
1905     d= block[stride*1 + xStride*1];
1906
1907     e= a-b;
1908     a= a+b;
1909     b= c-d;
1910     c= c+d;
1911
1912     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1913     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1914     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1915     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1916 }
1917
1918 #if 0
1919 static void chroma_dc_dct_c(DCTELEM *block){ //disabled code: in place 2x2 transform of block[0], block[16], block[32], block[48]
1920     const int stride= 16*2;
1921     const int xStride= 16;
1922     int a,b,c,d,e;
1923
1924     a= block[stride*0 + xStride*0];
1925     b= block[stride*0 + xStride*1];
1926     c= block[stride*1 + xStride*0];
1927     d= block[stride*1 + xStride*1];
1928
1929     e= a-b; //difference of the first pair, kept in e because a is reused for the sum
1930     a= a+b;
1931     b= c-d; //difference of the second pair, b is free again at this point
1932     c= c+d;
1933
1934     block[stride*0 + xStride*0]= (a+c); //no scaling or rounding is applied to the results
1935     block[stride*0 + xStride*1]= (e+b);
1936     block[stride*1 + xStride*0]= (a-c);
1937     block[stride*1 + xStride*1]= (e-b);
1938 }
1939 #endif
1940
1941 /**
1942  * gets the chroma qp.
1943  */
1944 static inline int get_chroma_qp(int chroma_qp_index_offset, int qscale){
1945
1946     return chroma_qp[clip(qscale + chroma_qp_index_offset, 0, 51)];
1947 }
1948
1949 //FIXME need to check that this doesnt overflow signed 32 bit for low qp, i am not sure, it's very close
1950 //FIXME check that gcc inlines this (and optimizes intra & seperate_dc stuff away)
1951 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int seperate_dc){
1952     int i;
1953     const int * const quant_table= quant_coeff[qscale];
1954     const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1955     const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1956     const unsigned int threshold2= (threshold1<<1);
1957     int last_non_zero;
1958
1959     if(seperate_dc){
1960         if(qscale<=18){
1961             //avoid overflows
1962             const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1963             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1964             const unsigned int dc_threshold2= (dc_threshold1<<1);
1965
1966             int level= block[0]*quant_coeff[qscale+18][0];
1967             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1968                 if(level>0){
1969                     level= (dc_bias + level)>>(QUANT_SHIFT-2);
1970                     block[0]= level;
1971                 }else{
1972                     level= (dc_bias - level)>>(QUANT_SHIFT-2);
1973                     block[0]= -level;
1974                 }
1975 //                last_non_zero = i;
1976             }else{
1977                 block[0]=0;
1978             }
1979         }else{
1980             const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1981             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1982             const unsigned int dc_threshold2= (dc_threshold1<<1);
1983
1984             int level= block[0]*quant_table[0];
1985             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1986                 if(level>0){
1987                     level= (dc_bias + level)>>(QUANT_SHIFT+1);
1988                     block[0]= level;
1989                 }else{
1990                     level= (dc_bias - level)>>(QUANT_SHIFT+1);
1991                     block[0]= -level;
1992                 }
1993 //                last_non_zero = i;
1994             }else{
1995                 block[0]=0;
1996             }
1997         }
1998         last_non_zero= 0;
1999         i=1;
2000     }else{
2001         last_non_zero= -1;
2002         i=0;
2003     }
2004
2005     for(; i<16; i++){
2006         const int j= scantable[i];
2007         int level= block[j]*quant_table[j];
2008
2009 //        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
2010 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
2011         if(((unsigned)(level+threshold1))>threshold2){
2012             if(level>0){
2013                 level= (bias + level)>>QUANT_SHIFT;
2014                 block[j]= level;
2015             }else{
2016                 level= (bias - level)>>QUANT_SHIFT;
2017                 block[j]= -level;
2018             }
2019             last_non_zero = i;
2020         }else{
2021             block[j]=0;
2022         }
2023     }
2024
2025     return last_non_zero;
2026 }
2027
/**
 * Fills the 4x4 block at src with copies of the 4 pixels above it.
 * Pixels are accessed 32 bits at a time, as in the other predictors here.
 * @param topright unused
 */
static void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){
    const uint32_t above= *(const uint32_t*)(src - stride);
    int y;

    for(y=0; y<4; y++)
        *(uint32_t*)(src + y*stride)= above;
}
2035
/**
 * Fills every row of the 4x4 block at src with the pixel left of that row.
 * @param topright unused
 */
static void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){
    int y;

    for(y=0; y<4; y++){
        uint8_t * const row= src + y*stride;

        //the constant is unsigned because pixel*0x01010101 does not fit
        //into a signed int for pixels >= 128
        *(uint32_t*)row= row[-1]*0x01010101U;
    }
}
2042
/**
 * Fills the 4x4 block at src with the rounded mean of the 4 pixels above
 * and the 4 pixels left of it.
 * @param topright unused
 */
static void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){
    int sum= 4; //rounding term of the division by 8
    uint32_t splat;
    int i;

    for(i=0; i<4; i++)
        sum+= src[i - stride] + src[-1 + i*stride];

    //unsigned multiply, dc*0x01010101 overflows a signed int for dc >= 128
    splat= (sum>>3)*0x01010101U;

    for(i=0; i<4; i++)
        *(uint32_t*)(src + i*stride)= splat;
}
2052
/**
 * Fills the 4x4 block at src with the rounded mean of the 4 pixels left of
 * it; the pixels above the block are not read.
 * @param topright unused
 */
static void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){
    int sum= 2; //rounding term of the division by 4
    uint32_t splat;
    int y;

    for(y=0; y<4; y++)
        sum+= src[-1 + y*stride];

    //unsigned multiply, dc*0x01010101 overflows a signed int for dc >= 128
    splat= (sum>>2)*0x01010101U;

    for(y=0; y<4; y++)
        *(uint32_t*)(src + y*stride)= splat;
}
2061
/**
 * Fills the 4x4 block at src with the rounded mean of the 4 pixels above
 * it; the pixels left of the block are not read.
 * @param topright unused
 */
static void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){
    int sum= 2; //rounding term of the division by 4
    uint32_t splat;
    int i;

    for(i=0; i<4; i++)
        sum+= src[i - stride];

    //unsigned multiply, dc*0x01010101 overflows a signed int for dc >= 128
    splat= (sum>>2)*0x01010101U;

    for(i=0; i<4; i++)
        *(uint32_t*)(src + i*stride)= splat;
}
2070
/**
 * Fills the 4x4 block at src with the constant 128, no neighbour is read.
 * @param topright unused
 */
static void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const uint32_t grey= 128U*0x01010101U;
    int y;

    for(y=0; y<4; y++)
        *(uint32_t*)(src + y*stride)= grey;
}
2077
2078
/*
 * Helpers which load the neighbours of a 4x4 block into const int locals.
 * They expand to declarations and therefore belong to the start of a block.
 * The last line of each macro carries no line continuation, so a macro ends
 * with its own text and not with whatever line happens to follow it.
 */

//t4..t7: the 4 pixels at topright[0..3], needs topright in scope
#define LOAD_TOP_RIGHT_EDGE\
    const int t4= topright[0];\
    const int t5= topright[1];\
    const int t6= topright[2];\
    const int t7= topright[3];

//l0..l3: the 4 pixels in the column left of src, needs src and stride in scope
#define LOAD_LEFT_EDGE\
    const int l0= src[-1+0*stride];\
    const int l1= src[-1+1*stride];\
    const int l2= src[-1+2*stride];\
    const int l3= src[-1+3*stride];

//t0..t3: the 4 pixels in the row above src, needs src and stride in scope
#define LOAD_TOP_EDGE\
    const int t0= src[ 0-1*stride];\
    const int t1= src[ 1-1*stride];\
    const int t2= src[ 2-1*stride];\
    const int t3= src[ 3-1*stride];
2096
/**
 * Diagonal down-right prediction of the 4x4 block at src.
 * The left column, the top left pixel and the top row form one edge of 9
 * pixels; every pixel of the block is the (1,2,1)/4 filtered edge value of
 * its diagonal.
 * @param topright unused
 */
static void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){
    int edge[9]; //l3 l2 l1 l0 lt t0 t1 t2 t3
    int diag[7];
    int i, x, y;

    //all neighbours are read before the block is written
    for(i=0; i<4; i++){
        edge[3-i]= src[-1 + i*stride];
        edge[5+i]= src[ i - 1*stride];
    }
    edge[4]= src[-1 - 1*stride];

    for(i=0; i<7; i++)
        diag[i]= (edge[i] + 2*edge[i+1] + edge[i+2] + 2)>>2;

    //x-y is constant along a diagonal, the main diagonal uses diag[3]
    for(y=0; y<4; y++){
        for(x=0; x<4; x++)
            src[x + y*stride]= diag[3 + x - y];
    }
}
2119
/**
 * Diagonal down-left prediction of the 4x4 block at src.
 * The 4 pixels above the block and the 4 pixels at topright form one row of
 * 8 pixels; every pixel of the block is the (1,2,1)/4 filtered value of its
 * anti-diagonal, the last pixel of the row is repeated for the final tap.
 * @param topright the 4 pixels right of the row above the block
 */
static void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){
    int top[9]; //t0 .. t7 followed by t7 once more
    int diag[7];
    int i, x, y;

    for(i=0; i<4; i++){
        top[i  ]= src[i - 1*stride];
        top[i+4]= topright[i];
    }
    top[8]= top[7];

    for(i=0; i<7; i++)
        diag[i]= (top[i] + 2*top[i+1] + top[i+2] + 2)>>2;

    //x+y is constant along an anti-diagonal
    for(y=0; y<4; y++){
        for(x=0; x<4; x++)
            src[x + y*stride]= diag[x + y];
    }
}
2142
/**
 * Vertical-right prediction of the 4x4 block at src.
 * Rows 0 and 1 hold the 2 tap and the 3 tap filtered top edge; rows 2 and 3
 * repeat them shifted right by one pixel, with a filtered value from the
 * left edge in their first column.
 * Only the neighbours which are needed are loaded, so no dummy variable is
 * required to silence unused variable warnings.
 * @param topright unused
 */
static void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){
    const int lt= src[-1-1*stride];
    const int t0= src[ 0-1*stride];
    const int t1= src[ 1-1*stride];
    const int t2= src[ 2-1*stride];
    const int t3= src[ 3-1*stride];
    const int l0= src[-1+0*stride];
    const int l1= src[-1+1*stride];
    const int l2= src[-1+2*stride];

    //2 tap averages, used by rows 0 and 2
    const int avg0= (lt + t0 + 1)>>1;
    const int avg1= (t0 + t1 + 1)>>1;
    const int avg2= (t1 + t2 + 1)>>1;
    const int avg3= (t2 + t3 + 1)>>1;
    //3 tap filtered values, used by rows 1 and 3
    const int flt0= (l0 + 2*lt + t0 + 2)>>2;
    const int flt1= (lt + 2*t0 + t1 + 2)>>2;
    const int flt2= (t0 + 2*t1 + t2 + 2)>>2;
    const int flt3= (t1 + 2*t2 + t3 + 2)>>2;

    uint8_t *row= src;

    row[0]= avg0; row[1]= avg1; row[2]= avg2; row[3]= avg3;
    row+= stride;
    row[0]= flt0; row[1]= flt1; row[2]= flt2; row[3]= flt3;
    row+= stride;
    row[0]= (lt + 2*l0 + l1 + 2)>>2;
    row[1]= avg0; row[2]= avg1; row[3]= avg2;
    row+= stride;
    row[0]= (l0 + 2*l1 + l2 + 2)>>2;
    row[1]= flt0; row[2]= flt1; row[3]= flt2;
}
2166
/**
 * Vertical-left prediction of the 4x4 block at src.
 * Even rows hold 2 tap averages and odd rows 3 tap filtered values of the
 * row above the block and its continuation at topright; rows 2 and 3 are
 * shifted left by one pixel against rows 0 and 1.
 * Only topright[0..2] are read, so no dummy variable is required to silence
 * unused variable warnings.
 * @param topright the pixels right of the row above the block
 */
static void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){
    int top[7]; //t0 .. t6
    int avg[5];
    int flt[5];
    int i, x, y;

    for(i=0; i<4; i++)
        top[i]= src[i - 1*stride];
    for(i=0; i<3; i++)
        top[4+i]= topright[i];

    for(i=0; i<5; i++){
        avg[i]= (top[i] + top[i+1] + 1)>>1;
        flt[i]= (top[i] + 2*top[i+1] + top[i+2] + 2)>>2;
    }

    for(y=0; y<4; y++){
        const int *line= (y&1) ? flt : avg;

        for(x=0; x<4; x++)
            src[x + y*stride]= line[x + (y>>1)];
    }
}
2189
/**
 * Horizontal-up prediction of the 4x4 block at src.
 * The left column is turned into a sequence which alternates between 2 tap
 * averages and 3 tap filtered values and ends in copies of the lowest left
 * pixel; every row of the block starts 2 entries later in that sequence.
 * @param topright unused
 */
static void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){
    const int l0= src[-1+0*stride];
    const int l1= src[-1+1*stride];
    const int l2= src[-1+2*stride];
    const int l3= src[-1+3*stride];
    int seq[10];
    int x, y;

    seq[0]= (l0 + l1 + 1)>>1;
    seq[1]= (l0 + 2*l1 + l2 + 2)>>2;
    seq[2]= (l1 + l2 + 1)>>1;
    seq[3]= (l1 + 2*l2 + l3 + 2)>>2;
    seq[4]= (l2 + l3 + 1)>>1;
    seq[5]= (l2 + 2*l3 + l3 + 2)>>2;
    seq[6]= seq[7]= seq[8]= seq[9]= l3;

    for(y=0; y<4; y++){
        for(x=0; x<4; x++)
            src[x + y*stride]= seq[x + 2*y];
    }
}
2210
2211 static void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){
2212     const int lt= src[-1-1*stride];
2213     LOAD_TOP_EDGE
2214     LOAD_LEFT_EDGE
2215     const __attribute__((unused)) int unu= t3;
2216
2217     src[0+0*stride]=
2218     src[2+1*stride]=(lt + l0 + 1)>>1;
2219     src[1+0*stride]=
2220     src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
2221     src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
2222     src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
2223     src[0+1*stride]=
2224     src[2+2*stride]=(l0 + l1 + 1)>>1;
2225     src[1+1*stride]=
2226     src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
2227     src[0+2*stride]=
2228     src[2+3*stride]=(l1 + l2+ 1)>>1;
2229     src[1+2*stride]=
2230     src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
2231     src[0+3*stride]=(l2 + l3 + 1)>>1;
2232     src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
2233 }
2234
/**
 * 16x16 vertical prediction: replicates the 16 samples of the row directly
 * above the block into each of the block's 16 rows.
 *
 * @param src    first sample of the block; src-stride must be readable
 * @param stride offset between two vertically adjacent samples
 */
void ff_pred16x16_vertical_c(uint8_t *src, int stride){
    const uint32_t * const above= (const uint32_t*)(src-stride);
    const uint32_t w0= above[0];
    const uint32_t w1= above[1];
    const uint32_t w2= above[2];
    const uint32_t w3= above[3];
    int row;

    /* the row above is cached as four 32-bit words and stored row by row */
    for(row=0; row<16; row++){
        uint32_t * const dst= (uint32_t*)(src + row*stride);
        dst[0]= w0;
        dst[1]= w1;
        dst[2]= w2;
        dst[3]= w3;
    }
}
2249
/**
 * 16x16 horizontal prediction: every row of the block is filled with the
 * sample immediately to its left.
 *
 * @param src    first sample of the block; src[-1] must be readable on each row
 * @param stride offset between two vertically adjacent samples
 */
void ff_pred16x16_horizontal_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<16; i++){
        uint32_t * const row= (uint32_t*)(src+i*stride);
        /* replicate the byte into all four lanes; the multiply must be
         * unsigned because pixel*0x01010101 exceeds INT_MAX for pixel >= 128 */
        const uint32_t splat= src[-1+i*stride]*0x01010101U;

        row[0]=
        row[1]=
        row[2]=
        row[3]= splat;
    }
}
2260
/**
 * 16x16 DC prediction: fills the block with the rounded mean of the 16 left
 * and the 16 top neighbouring samples.
 *
 * @param src    first sample of the block; the column at src[-1] and the row
 *               at src-stride must be readable
 * @param stride offset between two vertically adjacent samples
 */
void ff_pred16x16_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t dc;

    for(i=0;i<16; i++){
        sum+= src[-1+i*stride];
    }

    for(i=0;i<16; i++){
        sum+= src[i-stride];
    }

    /* mean of 32 samples, replicated into four bytes; done in unsigned
     * arithmetic because 0x01010101*mean exceeds INT_MAX for mean >= 128 */
    dc= 0x01010101U*((sum + 16)>>5);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= dc;
    }
}
2281
/**
 * 16x16 DC prediction from the left neighbours only: fills the block with the
 * rounded mean of the 16 samples in the column at src[-1].
 *
 * @param src    first sample of the block
 * @param stride offset between two vertically adjacent samples
 */
static void pred16x16_left_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t dc;

    for(i=0;i<16; i++){
        sum+= src[-1+i*stride];
    }

    /* unsigned multiply: 0x01010101*mean exceeds INT_MAX for mean >= 128 */
    dc= 0x01010101U*((sum + 8)>>4);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= dc;
    }
}
2298
/**
 * 16x16 DC prediction from the top neighbours only: fills the block with the
 * rounded mean of the 16 samples in the row at src-stride.
 *
 * @param src    first sample of the block
 * @param stride offset between two vertically adjacent samples
 */
static void pred16x16_top_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t dc;

    for(i=0;i<16; i++){
        sum+= src[i-stride];
    }
    /* unsigned multiply: 0x01010101*mean exceeds INT_MAX for mean >= 128 */
    dc= 0x01010101U*((sum + 8)>>4);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= dc;
    }
}
2314
/**
 * 16x16 DC prediction without neighbours: fills the whole block with the
 * constant value 128.
 *
 * @param src    first sample of the block
 * @param stride offset between two vertically adjacent samples
 */
void ff_pred16x16_128_dc_c(uint8_t *src, int stride){
    const uint32_t grey= 0x01010101U*128U; /* 128 in each of the four bytes */
    int row, word;

    for(row=0; row<16; row++){
        uint32_t * const dst= (uint32_t*)(src + row*stride);
        for(word=0; word<4; word++)
            dst[word]= grey;
    }
}
2325
2326 static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3){
2327   int i, j, k;
2328   int a;
2329   uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2330   const uint8_t * const src0 = src+7-stride;
2331   const uint8_t *src1 = src+8*stride-1;
2332   const uint8_t *src2 = src1-2*stride;      // == src+6*stride-1;
2333   int H = src0[1] - src0[-1];
2334   int V = src1[0] - src2[ 0];
2335   for(k=2; k<=8; ++k) {
2336     src1 += stride; src2 -= stride;
2337     H += k*(src0[k] - src0[-k]);
2338     V += k*(src1[0] - src2[ 0]);
2339   }
2340   if(svq3){
2341     H = ( 5*(H/4) ) / 16;
2342     V = ( 5*(V/4) ) / 16;
2343
2344     /* required for 100% accuracy */
2345     i = H; H = V; V = i;
2346   }else{
2347     H = ( 5*H+32 ) >> 6;
2348     V = ( 5*V+32 ) >> 6;
2349   }
2350
2351   a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
2352   for(j=16; j>0; --j) {
2353     int b = a;
2354     a += V;
2355     for(i=-16; i<0; i+=4) {
2356       src[16+i] = cm[ (b    ) >> 5 ];
2357       src[17+i] = cm[ (b+  H) >> 5 ];
2358       src[18+i] = cm[ (b+2*H) >> 5 ];
2359       src[19+i] = cm[ (b+3*H) >> 5 ];
2360       b += 4*H;
2361     }
2362     src += stride;
2363   }
2364 }
2365
/**
 * 16x16 plane prediction: pred16x16_plane_compat_c() with the svq3 variant
 * switched off.
 *
 * @param src    first sample of the block
 * @param stride offset between two vertically adjacent samples
 */
void ff_pred16x16_plane_c(uint8_t *src, int stride){
    const int svq3= 0;

    pred16x16_plane_compat_c(src, stride, svq3);
}
2369
/**
 * 8x8 vertical prediction: replicates the 8 samples of the row directly above
 * the block into each of the block's 8 rows.
 *
 * @param src    first sample of the block; src-stride must be readable
 * @param stride offset between two vertically adjacent samples
 */
void ff_pred8x8_vertical_c(uint8_t *src, int stride){
    const uint32_t * const above= (const uint32_t*)(src-stride);
    const uint32_t lo= above[0];
    const uint32_t hi= above[1];
    int row;

    for(row=0; row<8; row++){
        uint32_t * const dst= (uint32_t*)(src + row*stride);
        dst[0]= lo;
        dst[1]= hi;
    }
}
2380
/**
 * 8x8 horizontal prediction: every row of the block is filled with the sample
 * immediately to its left.
 *
 * @param src    first sample of the block; src[-1] must be readable on each row
 * @param stride offset between two vertically adjacent samples
 */
void ff_pred8x8_horizontal_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<8; i++){
        uint32_t * const row= (uint32_t*)(src+i*stride);
        /* unsigned multiply: pixel*0x01010101 exceeds INT_MAX for pixel >= 128 */
        const uint32_t splat= src[-1+i*stride]*0x01010101U;

        row[0]=
        row[1]= splat;
    }
}
2389
/**
 * 8x8 DC prediction without neighbours: fills the whole block with the
 * constant value 128.
 *
 * @param src    first sample of the block
 * @param stride offset between two vertically adjacent samples
 */
void ff_pred8x8_128_dc_c(uint8_t *src, int stride){
    const uint32_t grey= 0x01010101U*128U; /* 128 in each of the four bytes */
    int row;

    for(row=0; row<8; row++){
        uint32_t * const dst= (uint32_t*)(src + row*stride);
        dst[0]= grey;
        dst[1]= grey;
    }
}
2398
/**
 * 8x8 DC prediction from the left neighbours only. The upper four rows get
 * the rounded mean of left samples 0..3, the lower four rows the rounded mean
 * of left samples 4..7.
 *
 * @param src    first sample of the block; src[-1] must be readable on each row
 * @param stride offset between two vertically adjacent samples
 */
static void pred8x8_left_dc_c(uint8_t *src, int stride){
    int i;
    int sum_upper=0, sum_lower=0;
    uint32_t dc0, dc2;

    for(i=0;i<4; i++){
        sum_upper+= src[-1+i*stride];
        sum_lower+= src[-1+(i+4)*stride];
    }
    /* unsigned multiply: 0x01010101*mean exceeds INT_MAX for mean >= 128 */
    dc0= 0x01010101U*((sum_upper + 2)>>2);
    dc2= 0x01010101U*((sum_lower + 2)>>2);

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= dc0;
    }
    for(i=4; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= dc2;
    }
}
2420
/**
 * 8x8 DC prediction from the top neighbours only. The left four columns get
 * the rounded mean of top samples 0..3, the right four columns the rounded
 * mean of top samples 4..7; all eight rows are identical.
 *
 * @param src    first sample of the block; src-stride must be readable
 * @param stride offset between two vertically adjacent samples
 */
static void pred8x8_top_dc_c(uint8_t *src, int stride){
    int i;
    int sum_left=0, sum_right=0;
    uint32_t dc0, dc1;

    for(i=0;i<4; i++){
        sum_left += src[i-stride];
        sum_right+= src[4+i-stride];
    }
    /* unsigned multiply: 0x01010101*mean exceeds INT_MAX for mean >= 128 */
    dc0= 0x01010101U*((sum_left  + 2)>>2);
    dc1= 0x01010101U*((sum_right + 2)>>2);

    /* upper and lower halves receive the same values, so one loop suffices */
    for(i=0; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= dc0;
        ((uint32_t*)(src+i*stride))[1]= dc1;
    }
}
2442
2443
/**
 * 8x8 DC prediction, computed separately for each 4x4 quadrant:
 *  - top-left:     mean of left samples 0..3 and top samples 0..3
 *  - top-right:    mean of top samples 4..7
 *  - bottom-left:  mean of left samples 4..7
 *  - bottom-right: mean of top samples 4..7 and left samples 4..7
 *
 * @param src    first sample of the block; the column at src[-1] and the row
 *               at src-stride must be readable
 * @param stride offset between two vertically adjacent samples
 */
void ff_pred8x8_dc_c(uint8_t *src, int stride){
    int i;
    int sum_corner=0, sum_top_right=0, sum_left_bottom=0;
    uint32_t dc0, dc1, dc2, dc3;

    for(i=0;i<4; i++){
        sum_corner     += src[-1+i*stride] + src[i-stride];
        sum_top_right  += src[4+i-stride];
        sum_left_bottom+= src[-1+(i+4)*stride];
    }
    /* unsigned multiplies: 0x01010101*mean exceeds INT_MAX for mean >= 128 */
    dc0= 0x01010101U*((sum_corner + 4)>>3);
    dc1= 0x01010101U*((sum_top_right + 2)>>2);
    dc2= 0x01010101U*((sum_left_bottom + 2)>>2);
    dc3= 0x01010101U*((sum_top_right + sum_left_bottom + 4)>>3);

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]= dc0;
        ((uint32_t*)(src+i*stride))[1]= dc1;
    }
    for(i=4; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= dc2;
        ((uint32_t*)(src+i*stride))[1]= dc3;
    }
}
2468
2469 void ff_pred8x8_plane_c(uint8_t *src, int stride){
2470   int j, k;
2471   int a;
2472   uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2473   const uint8_t * const src0 = src+3-stride;
2474   const uint8_t *src1 = src+4*stride-1;
2475   const uint8_t *src2 = src1-2*stride;      // == src+2*stride-1;
2476   int H = src0[1] - src0[-1];
2477   int V = src1[0] - src2[ 0];
2478   for(k=2; k<=4; ++k) {
2479     src1 += stride; src2 -= stride;
2480     H += k*(src0[k] - src0[-k]);
2481     V += k*(src1[0] - src2[ 0]);
2482   }
2483   H = ( 17*H+16 ) >> 5;
2484   V = ( 17*V+16 ) >> 5;
2485
2486   a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
2487   for(j=8; j>0; --j) {
2488     int b = a;
2489     a += V;
2490     src[0] = cm[ (b    ) >> 5 ];
2491     src[1] = cm[ (b+  H) >> 5 ];
2492     src[2] = cm[ (b+2*H) >> 5 ];
2493     src[3] = cm[ (b+3*H) >> 5 ];
2494     src[4] = cm[ (b+4*H) >> 5 ];
2495     src[5] = cm[ (b+5*H) >> 5 ];
2496     src[6] = cm[ (b+6*H) >> 5 ];
2497     src[7] = cm[ (b+7*H) >> 5 ];
2498     src += stride;
2499   }
2500 }
2501
/*
 * Helpers for the pred8x8l_* functions below. They all expect src, stride,
 * has_topleft and has_topright to be in scope.
 */

/* sample at column x, row y relative to the first sample of the block */
#define SRC(x,y) src[(x)+(y)*stride]

/* l<y>: left neighbour of row y, smoothed with a (1,2,1) filter */
#define PL(y) \
    const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;

/* declares l0..l7; l0 reuses SRC(-1,0) when no top-left sample is available,
 * and l7 counts the last sample three times instead of reading past the end */
#define PREDICT_8x8_LOAD_LEFT \
    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
    PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
    const int l7 attribute_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2

/* t<x>: top neighbour of column x, smoothed with a (1,2,1) filter */
#define PT(x) \
    const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;

/* declares t0..t7; missing top-left / top-right samples are replaced by the
 * nearest sample of the top row */
#define PREDICT_8x8_LOAD_TOP \
    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
    PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
    const int t7 attribute_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2

/* like PT() but assigns to an already declared t<x> */
#define PTR(x) \
    t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;

/* declares t8..t15: the smoothed top-right row when has_topright is set,
 * otherwise eight copies of the unfiltered sample SRC(7,-1) */
#define PREDICT_8x8_LOAD_TOPRIGHT \
    int t8, t9, t10, t11, t12, t13, t14, t15; \
    if(has_topright) { \
        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);

/* declares lt: the top-left corner smoothed with its two neighbours */
#define PREDICT_8x8_LOAD_TOPLEFT \
    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2

/* stores the 32-bit pattern v into both halves of all eight rows;
 * note that this advances src */
#define PREDICT_8x8_DC(v) \
    int y; \
    for( y = 0; y < 8; y++ ) { \
        ((uint32_t*)src)[0] = \
        ((uint32_t*)src)[1] = (v); \
        src += stride; \
    }
2539
/**
 * 8x8 luma DC prediction without neighbours: fills the block with 128.
 * has_topleft and has_topright are not used.
 */
static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    /* 128 replicated into all four bytes of a word */
    PREDICT_8x8_DC(0x01010101U*128U);
}
2544 static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2545 {
2546     PREDICT_8x8_LOAD_LEFT;
2547     const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101;
2548     PREDICT_8x8_DC(dc);
2549 }
2550 static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2551 {
2552     PREDICT_8x8_LOAD_TOP;
2553     const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101;
2554     PREDICT_8x8_DC(dc);
2555 }
2556 static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2557 {
2558     PREDICT_8x8_LOAD_LEFT;
2559     PREDICT_8x8_LOAD_TOP;
2560     const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7
2561                          +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101;
2562     PREDICT_8x8_DC(dc);
2563 }
2564 static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2565 {
2566     PREDICT_8x8_LOAD_LEFT;
2567 #define ROW(y) ((uint32_t*)(src+y*stride))[0] =\
2568                ((uint32_t*)(src+y*stride))[1] = 0x01010101 * l##y
2569     ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
2570 #undef ROW
2571 }
2572 static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2573 {
2574     int y;
2575     PREDICT_8x8_LOAD_TOP;
2576     src[0] = t0;
2577     src[1] = t1;
2578     src[2] = t2;
2579     src[3] = t3;
2580     src[4] = t4;
2581     src[5] = t5;
2582     src[6] = t6;
2583     src[7] = t7;
2584     for( y = 1; y < 8; y++ )
2585         *(uint64_t*)(src+y*stride) = *(uint64_t*)src;
2586 }
2587 static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2588 {
2589     PREDICT_8x8_LOAD_TOP;
2590     PREDICT_8x8_LOAD_TOPRIGHT;
2591     SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
2592     SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
2593     SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
2594     SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
2595     SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
2596     SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
2597     SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
2598     SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
2599     SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
2600     SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
2601     SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
2602     SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
2603     SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
2604     SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
2605     SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
2606 }
2607 static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2608 {
2609     PREDICT_8x8_LOAD_TOP;
2610     PREDICT_8x8_LOAD_LEFT;
2611     PREDICT_8x8_LOAD_TOPLEFT;
2612     SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
2613     SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
2614     SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
2615     SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
2616     SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
2617     SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
2618     SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
2619     SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
2620     SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
2621     SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
2622     SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
2623     SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
2624     SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
2625     SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
2626     SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
2627
2628 }
2629 static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2630 {
2631     PREDICT_8x8_LOAD_TOP;
2632     PREDICT_8x8_LOAD_LEFT;
2633     PREDICT_8x8_LOAD_TOPLEFT;
2634     SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
2635     SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
2636     SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
2637     SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
2638     SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
2639     SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
2640     SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
2641     SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
2642     SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
2643     SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
2644     SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
2645     SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
2646     SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
2647     SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
2648     SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
2649     SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
2650     SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
2651     SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
2652     SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
2653     SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
2654     SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
2655     SRC(7,0)= (t6 + t7 + 1) >> 1;
2656 }
2657 static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2658 {
2659     PREDICT_8x8_LOAD_TOP;
2660     PREDICT_8x8_LOAD_LEFT;
2661     PREDICT_8x8_LOAD_TOPLEFT;
2662     SRC(0,7)= (l6 + l7 + 1) >> 1;
2663     SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
2664     SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
2665     SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
2666     SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
2667     SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
2668     SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
2669     SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
2670     SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
2671     SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
2672     SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
2673     SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
2674     SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
2675     SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
2676     SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
2677     SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
2678     SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
2679     SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
2680     SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
2681     SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
2682     SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
2683     SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
2684 }
2685 static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2686 {
2687     PREDICT_8x8_LOAD_TOP;
2688     PREDICT_8x8_LOAD_TOPRIGHT;
2689     SRC(0,0)= (t0 + t1 + 1) >> 1;
2690     SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
2691     SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
2692     SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
2693     SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
2694     SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
2695     SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
2696     SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
2697     SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
2698     SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
2699     SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
2700     SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
2701     SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
2702     SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
2703     SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
2704     SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
2705     SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
2706     SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
2707     SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
2708     SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
2709     SRC(7,6)= (t10 + t11 + 1) >> 1;
2710     SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
2711 }
2712 static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2713 {
2714     PREDICT_8x8_LOAD_LEFT;
2715     SRC(0,0)= (l0 + l1 + 1) >> 1;
2716     SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
2717     SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
2718     SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
2719     SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
2720     SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
2721     SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
2722     SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
2723     SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
2724     SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
2725     SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
2726     SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
2727     SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
2728     SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
2729     SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
2730     SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
2731     SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
2732     SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
2733 }
/* the 8x8 luma prediction helper macros are not needed past this point */
#undef SRC
#undef PL
#undef PT
#undef PTR
#undef PREDICT_8x8_DC
#undef PREDICT_8x8_LOAD_TOPRIGHT
#undef PREDICT_8x8_LOAD_TOPLEFT
#undef PREDICT_8x8_LOAD_TOP
#undef PREDICT_8x8_LOAD_LEFT
2743
2744 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
2745                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2746                            int src_x_offset, int src_y_offset,
2747                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
2748     MpegEncContext * const s = &h->s;
2749     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
2750     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
2751     const int luma_xy= (mx&3) + ((my&3)<<2);
2752     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
2753     uint8_t * src_cb, * src_cr;
2754     int extra_width= h->emu_edge_width;
2755     int extra_height= h->emu_edge_height;
2756     int emu=0;
2757     const int full_mx= mx>>2;
2758     const int full_my= my>>2;
2759     const int pic_width  = 16*s->mb_width;
2760     const int pic_height = 16*s->mb_height >> MB_MBAFF;
2761
2762     if(!pic->data[0])
2763         return;
2764
2765     if(mx&7) extra_width -= 3;
2766     if(my&7) extra_height -= 3;
2767
2768     if(   full_mx < 0-extra_width
2769        || full_my < 0-extra_height
2770        || full_mx + 16/*FIXME*/ > pic_width + extra_width
2771        || full_my + 16/*FIXME*/ > pic_height + extra_height){
2772         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
2773             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
2774         emu=1;
2775     }
2776
2777     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
2778     if(!square){
2779         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
2780     }
2781
2782     if(s->flags&CODEC_FLAG_GRAY) return;
2783
2784     if(MB_MBAFF){
2785         // chroma offset when predicting from a field of opposite parity
2786         my += 2 * ((s->mb_y & 1) - (h->ref_cache[list][scan8[n]] & 1));
2787         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
2788     }
2789     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
2790     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
2791
2792     if(emu){
2793         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
2794             src_cb= s->edge_emu_buffer;
2795     }
2796     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
2797
2798     if(emu){
2799         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
2800             src_cr= s->edge_emu_buffer;
2801     }
2802     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
2803 }
2804
2805 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
2806                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2807                            int x_offset, int y_offset,
2808                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2809                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
2810                            int list0, int list1){
2811     MpegEncContext * const s = &h->s;
2812     qpel_mc_func *qpix_op=  qpix_put;
2813     h264_chroma_mc_func chroma_op= chroma_put;
2814
2815     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
2816     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
2817     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
2818     x_offset += 8*s->mb_x;
2819     y_offset += 8*(s->mb_y >> MB_MBAFF);
2820
2821     if(list0){
2822         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
2823         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
2824                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
2825                            qpix_op, chroma_op);
2826
2827         qpix_op=  qpix_avg;
2828         chroma_op= chroma_avg;
2829     }
2830
2831     if(list1){
2832         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
2833         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
2834                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
2835                            qpix_op, chroma_op);
2836     }
2837 }
2838
2839 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
2840                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2841                            int x_offset, int y_offset,
2842                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2843                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
2844                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
2845                            int list0, int list1){
2846     MpegEncContext * const s = &h->s;
2847
2848     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
2849     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
2850     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
2851     x_offset += 8*s->mb_x;
2852     y_offset += 8*(s->mb_y >> MB_MBAFF);
2853
2854     if(list0 && list1){
2855         /* don't optimize for luma-only case, since B-frames usually
2856          * use implicit weights => chroma too. */
2857         uint8_t *tmp_cb = s->obmc_scratchpad;
2858         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
2859         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
2860         int refn0 = h->ref_cache[0][ scan8[n] ];
2861         int refn1 = h->ref_cache[1][ scan8[n] ];
2862
2863         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
2864                     dest_y, dest_cb, dest_cr,
2865                     x_offset, y_offset, qpix_put, chroma_put);
2866         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
2867                     tmp_y, tmp_cb, tmp_cr,
2868                     x_offset, y_offset, qpix_put, chroma_put);
2869
2870         if(h->use_weight == 2){
2871             int weight0 = h->implicit_weight[refn0][refn1];
2872             int weight1 = 64 - weight0;
2873             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
2874             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
2875             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
2876         }else{
2877             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
2878                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
2879                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
2880             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
2881                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
2882                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
2883             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
2884                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
2885                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
2886         }
2887     }else{
2888         int list = list1 ? 1 : 0;
2889         int refn = h->ref_cache[list][ scan8[n] ];
2890         Picture *ref= &h->ref_list[list][refn];
2891         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
2892                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
2893                     qpix_put, chroma_put);
2894
2895         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
2896                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
2897         if(h->use_weight_chroma){
2898             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
2899                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
2900             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
2901                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
2902         }
2903     }
2904 }
2905
2906 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
2907                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2908                            int x_offset, int y_offset,
2909                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2910                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
2911                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
2912                            int list0, int list1){
2913     if((h->use_weight==2 && list0 && list1
2914         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
2915        || h->use_weight==1)
2916         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
2917                          x_offset, y_offset, qpix_put, chroma_put,
2918                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
2919     else
2920         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
2921                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
2922 }
2923
2924 static inline void prefetch_motion(H264Context *h, int list){
2925     /* fetch pixels for estimated mv 4 macroblocks ahead
2926      * optimized for 64byte cache lines */
2927     MpegEncContext * const s = &h->s;
2928     const int refn = h->ref_cache[list][scan8[0]];
2929     if(refn >= 0){
2930         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
2931         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
2932         uint8_t **src= h->ref_list[list][refn].data;
2933         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
2934         s->dsp.prefetch(src[0]+off, s->linesize, 4);
2935         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
2936         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
2937     }
2938 }
2939
2940 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2941                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
2942                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
2943                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
2944     MpegEncContext * const s = &h->s;
2945     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
2946     const int mb_type= s->current_picture.mb_type[mb_xy];
2947
2948     assert(IS_INTER(mb_type));
2949
2950     prefetch_motion(h, 0);
2951
2952     if(IS_16X16(mb_type)){
2953         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
2954                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
2955                 &weight_op[0], &weight_avg[0],
2956                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2957     }else if(IS_16X8(mb_type)){
2958         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
2959                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
2960                 &weight_op[1], &weight_avg[1],
2961                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2962         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
2963                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
2964                 &weight_op[1], &weight_avg[1],
2965                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
2966     }else if(IS_8X16(mb_type)){
2967         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
2968                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2969                 &weight_op[2], &weight_avg[2],
2970                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2971         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
2972                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2973                 &weight_op[2], &weight_avg[2],
2974                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
2975     }else{
2976         int i;
2977
2978         assert(IS_8X8(mb_type));
2979
2980         for(i=0; i<4; i++){
2981             const int sub_mb_type= h->sub_mb_type[i];
2982             const int n= 4*i;
2983             int x_offset= (i&1)<<2;
2984             int y_offset= (i&2)<<1;
2985
2986             if(IS_SUB_8X8(sub_mb_type)){
2987                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
2988                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2989                     &weight_op[3], &weight_avg[3],
2990                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2991             }else if(IS_SUB_8X4(sub_mb_type)){
2992                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
2993                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
2994                     &weight_op[4], &weight_avg[4],
2995                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2996                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
2997                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
2998                     &weight_op[4], &weight_avg[4],
2999                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3000             }else if(IS_SUB_4X8(sub_mb_type)){
3001                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
3002                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
3003                     &weight_op[5], &weight_avg[5],
3004                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3005                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
3006                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
3007                     &weight_op[5], &weight_avg[5],
3008                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3009             }else{
3010                 int j;
3011                 assert(IS_SUB_4X4(sub_mb_type));
3012                 for(j=0; j<4; j++){
3013                     int sub_x_offset= x_offset + 2*(j&1);
3014                     int sub_y_offset= y_offset +   (j&2);
3015                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
3016                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
3017                         &weight_op[6], &weight_avg[6],
3018                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3019                 }
3020             }
3021         }
3022     }
3023
3024     prefetch_motion(h, 1);
3025 }
3026
3027 static void decode_init_vlc(){
3028     static int done = 0;
3029
3030     if (!done) {
3031         int i;
3032         done = 1;
3033
3034         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
3035                  &chroma_dc_coeff_token_len [0], 1, 1,
3036                  &chroma_dc_coeff_token_bits[0], 1, 1, 1);
3037
3038         for(i=0; i<4; i++){
3039             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
3040                      &coeff_token_len [i][0], 1, 1,
3041                      &coeff_token_bits[i][0], 1, 1, 1);
3042         }
3043
3044         for(i=0; i<3; i++){
3045             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
3046                      &chroma_dc_total_zeros_len [i][0], 1, 1,
3047                      &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
3048         }
3049         for(i=0; i<15; i++){
3050             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
3051                      &total_zeros_len [i][0], 1, 1,
3052                      &total_zeros_bits[i][0], 1, 1, 1);
3053         }
3054
3055         for(i=0; i<6; i++){
3056             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
3057                      &run_len [i][0], 1, 1,
3058                      &run_bits[i][0], 1, 1, 1);
3059         }
3060         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
3061                  &run_len [6][0], 1, 1,
3062                  &run_bits[6][0], 1, 1, 1);
3063     }
3064 }
3065
3066 /**
3067  * Sets the intra prediction function pointers.
3068  */
3069 static void init_pred_ptrs(H264Context *h){
3070 //    MpegEncContext * const s = &h->s;
3071
3072     h->pred4x4[VERT_PRED           ]= pred4x4_vertical_c;
3073     h->pred4x4[HOR_PRED            ]= pred4x4_horizontal_c;
3074     h->pred4x4[DC_PRED             ]= pred4x4_dc_c;
3075     h->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_c;
3076     h->pred4x4[DIAG_DOWN_RIGHT_PRED]= pred4x4_down_right_c;
3077     h->pred4x4[VERT_RIGHT_PRED     ]= pred4x4_vertical_right_c;
3078     h->pred4x4[HOR_DOWN_PRED       ]= pred4x4_horizontal_down_c;
3079     h->pred4x4[VERT_LEFT_PRED      ]= pred4x4_vertical_left_c;
3080     h->pred4x4[HOR_UP_PRED         ]= pred4x4_horizontal_up_c;
3081     h->pred4x4[LEFT_DC_PRED        ]= pred4x4_left_dc_c;
3082     h->pred4x4[TOP_DC_PRED         ]= pred4x4_top_dc_c;
3083     h->pred4x4[DC_128_PRED         ]= pred4x4_128_dc_c;
3084
3085     h->pred8x8l[VERT_PRED           ]= pred8x8l_vertical_c;
3086     h->pred8x8l[HOR_PRED            ]= pred8x8l_horizontal_c;
3087     h->pred8x8l[DC_PRED             ]= pred8x8l_dc_c;
3088     h->pred8x8l[DIAG_DOWN_LEFT_PRED ]= pred8x8l_down_left_c;
3089     h->pred8x8l[DIAG_DOWN_RIGHT_PRED]= pred8x8l_down_right_c;
3090     h->pred8x8l[VERT_RIGHT_PRED     ]= pred8x8l_vertical_right_c;
3091     h->pred8x8l[HOR_DOWN_PRED       ]= pred8x8l_horizontal_down_c;
3092     h->pred8x8l[VERT_LEFT_PRED      ]= pred8x8l_vertical_left_c;
3093     h->pred8x8l[HOR_UP_PRED         ]= pred8x8l_horizontal_up_c;
3094     h->pred8x8l[LEFT_DC_PRED        ]= pred8x8l_left_dc_c;
3095     h->pred8x8l[TOP_DC_PRED         ]= pred8x8l_top_dc_c;
3096     h->pred8x8l[DC_128_PRED         ]= pred8x8l_128_dc_c;
3097
3098     h->pred8x8[DC_PRED8x8     ]= ff_pred8x8_dc_c;
3099     h->pred8x8[VERT_PRED8x8   ]= ff_pred8x8_vertical_c;
3100     h->pred8x8[HOR_PRED8x8    ]= ff_pred8x8_horizontal_c;
3101     h->pred8x8[PLANE_PRED8x8  ]= ff_pred8x8_plane_c;
3102     h->pred8x8[LEFT_DC_PRED8x8]= pred8x8_left_dc_c;
3103     h->pred8x8[TOP_DC_PRED8x8 ]= pred8x8_top_dc_c;
3104     h->pred8x8[DC_128_PRED8x8 ]= ff_pred8x8_128_dc_c;
3105
3106     h->pred16x16[DC_PRED8x8     ]= ff_pred16x16_dc_c;
3107     h->pred16x16[VERT_PRED8x8   ]= ff_pred16x16_vertical_c;
3108     h->pred16x16[HOR_PRED8x8    ]= ff_pred16x16_horizontal_c;
3109     h->pred16x16[PLANE_PRED8x8  ]= ff_pred16x16_plane_c;
3110     h->pred16x16[LEFT_DC_PRED8x8]= pred16x16_left_dc_c;
3111     h->pred16x16[TOP_DC_PRED8x8 ]= pred16x16_top_dc_c;
3112     h->pred16x16[DC_128_PRED8x8 ]= ff_pred16x16_128_dc_c;
3113 }
3114
3115 static void free_tables(H264Context *h){
3116     av_freep(&h->intra4x4_pred_mode);
3117     av_freep(&h->chroma_pred_mode_table);
3118     av_freep(&h->cbp_table);
3119     av_freep(&h->mvd_table[0]);
3120     av_freep(&h->mvd_table[1]);
3121     av_freep(&h->direct_table);
3122     av_freep(&h->non_zero_count);
3123     av_freep(&h->slice_table_base);
3124     av_freep(&h->top_borders[1]);
3125     av_freep(&h->top_borders[0]);
3126     h->slice_table= NULL;
3127
3128     av_freep(&h->mb2b_xy);
3129     av_freep(&h->mb2b8_xy);
3130
3131     av_freep(&h->s.obmc_scratchpad);
3132 }
3133
3134 static void init_dequant8_coeff_table(H264Context *h){
3135     int i,q,x;
3136     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
3137     h->dequant8_coeff[0] = h->dequant8_buffer[0];
3138     h->dequant8_coeff[1] = h->dequant8_buffer[1];
3139
3140     for(i=0; i<2; i++ ){
3141         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
3142             h->dequant8_coeff[1] = h->dequant8_buffer[0];
3143             break;
3144         }
3145
3146         for(q=0; q<52; q++){
3147             int shift = div6[q];
3148             int idx = rem6[q];
3149             for(x=0; x<64; x++)
3150                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
3151                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
3152                     h->pps.scaling_matrix8[i][x]) << shift;
3153         }
3154     }
3155 }
3156
3157 static void init_dequant4_coeff_table(H264Context *h){
3158     int i,j,q,x;
3159     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
3160     for(i=0; i<6; i++ ){
3161         h->dequant4_coeff[i] = h->dequant4_buffer[i];
3162         for(j=0; j<i; j++){
3163             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
3164                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
3165                 break;
3166             }
3167         }
3168         if(j<i)
3169             continue;
3170
3171         for(q=0; q<52; q++){
3172             int shift = div6[q] + 2;
3173             int idx = rem6[q];
3174             for(x=0; x<16; x++)
3175                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
3176                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
3177                     h->pps.scaling_matrix4[i][x]) << shift;
3178         }
3179     }
3180 }
3181
3182 static void init_dequant_tables(H264Context *h){
3183     int i,x;
3184     init_dequant4_coeff_table(h);
3185     if(h->pps.transform_8x8_mode)
3186         init_dequant8_coeff_table(h);
3187     if(h->sps.transform_bypass){
3188         for(i=0; i<6; i++)
3189             for(x=0; x<16; x++)
3190                 h->dequant4_coeff[i][0][x] = 1<<6;
3191         if(h->pps.transform_8x8_mode)
3192             for(i=0; i<2; i++)
3193                 for(x=0; x<64; x++)
3194                     h->dequant8_coeff[i][0][x] = 1<<6;
3195     }
3196 }
3197
3198
3199 /**
3200  * allocates tables.
3201  * needs width/height
3202  */
3203 static int alloc_tables(H264Context *h){
3204     MpegEncContext * const s = &h->s;
3205     const int big_mb_num= s->mb_stride * (s->mb_height+1);
3206     int x,y;
3207
3208     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
3209
3210     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
3211     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
3212     CHECKED_ALLOCZ(h->top_borders[0]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
3213     CHECKED_ALLOCZ(h->top_borders[1]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
3214     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
3215
3216     if( h->pps.cabac ) {
3217         CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
3218         CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
3219         CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
3220         CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
3221     }
3222
3223     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(uint8_t));
3224     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
3225
3226     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
3227     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
3228     for(y=0; y<s->mb_height; y++){
3229         for(x=0; x<s->mb_width; x++){
3230             const int mb_xy= x + y*s->mb_stride;
3231             const int b_xy = 4*x + 4*y*h->b_stride;
3232             const int b8_xy= 2*x + 2*y*h->b8_stride;
3233
3234             h->mb2b_xy [mb_xy]= b_xy;
3235             h->mb2b8_xy[mb_xy]= b8_xy;
3236         }
3237     }
3238
3239     s->obmc_scratchpad = NULL;
3240
3241     if(!h->dequant4_coeff[0])
3242         init_dequant_tables(h);
3243
3244     return 0;
3245 fail:
3246     free_tables(h);
3247     return -1;
3248 }
3249
3250 static void common_init(H264Context *h){
3251     MpegEncContext * const s = &h->s;
3252
3253     s->width = s->avctx->width;
3254     s->height = s->avctx->height;
3255     s->codec_id= s->avctx->codec->id;
3256
3257     init_pred_ptrs(h);
3258
3259     h->dequant_coeff_pps= -1;
3260     s->unrestricted_mv=1;
3261     s->decode=1; //FIXME
3262
3263     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
3264     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
3265 }
3266
3267 static int decode_init(AVCodecContext *avctx){
3268     H264Context *h= avctx->priv_data;
3269     MpegEncContext * const s = &h->s;
3270
3271     MPV_decode_defaults(s);
3272
3273     s->avctx = avctx;
3274     common_init(h);
3275
3276     s->out_format = FMT_H264;
3277     s->workaround_bugs= avctx->workaround_bugs;
3278
3279     // set defaults
3280 //    s->decode_mb= ff_h263_decode_mb;
3281     s->low_delay= 1;
3282     avctx->pix_fmt= PIX_FMT_YUV420P;
3283
3284     decode_init_vlc();
3285
3286     if(avctx->extradata_size > 0 && avctx->extradata &&
3287        *(char *)avctx->extradata == 1){
3288         h->is_avc = 1;
3289         h->got_avcC = 0;
3290     } else {
3291         h->is_avc = 0;
3292     }
3293
3294     return 0;
3295 }
3296
3297 static int frame_start(H264Context *h){
3298     MpegEncContext * const s = &h->s;
3299     int i;
3300
3301     if(MPV_frame_start(s, s->avctx) < 0)
3302         return -1;
3303     ff_er_frame_start(s);
3304
3305     assert(s->linesize && s->uvlinesize);
3306
3307     for(i=0; i<16; i++){
3308         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
3309         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
3310     }
3311     for(i=0; i<4; i++){
3312         h->block_offset[16+i]=
3313         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
3314         h->block_offset[24+16+i]=
3315         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
3316     }
3317
3318     /* can't be in alloc_tables because linesize isn't known there.
3319      * FIXME: redo bipred weight to not require extra buffer? */
3320     if(!s->obmc_scratchpad)
3321         s->obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
3322
3323     /* some macroblocks will be accessed before they're available */
3324     if(FRAME_MBAFF)
3325         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
3326
3327 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
3328     return 0;
3329 }
3330
3331 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
3332     MpegEncContext * const s = &h->s;
3333     int i;
3334
3335     src_y  -=   linesize;
3336     src_cb -= uvlinesize;
3337     src_cr -= uvlinesize;
3338
3339     // There are two lines saved, the line above the the top macroblock of a pair,
3340     // and the line above the bottom macroblock
3341     h->left_border[0]= h->top_borders[0][s->mb_x][15];
3342     for(i=1; i<17; i++){
3343         h->left_border[i]= src_y[15+i*  linesize];
3344     }
3345
3346     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
3347     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
3348
3349     if(!(s->flags&CODEC_FLAG_GRAY)){
3350         h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
3351         h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
3352         for(i=1; i<9; i++){
3353             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
3354             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
3355         }
3356         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
3357         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
3358     }
3359 }
3360
3361 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
3362     MpegEncContext * const s = &h->s;
3363     int temp8, i;
3364     uint64_t temp64;
3365     int deblock_left = (s->mb_x > 0);
3366     int deblock_top  = (s->mb_y > 0);
3367
3368     src_y  -=   linesize + 1;
3369     src_cb -= uvlinesize + 1;
3370     src_cr -= uvlinesize + 1;
3371
3372 #define XCHG(a,b,t,xchg)\
3373 t= a;\
3374 if(xchg)\
3375     a= b;\
3376 b= t;
3377
3378     if(deblock_left){
3379         for(i = !deblock_top; i<17; i++){
3380             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
3381         }
3382     }
3383
3384     if(deblock_top){
3385         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
3386         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
3387         if(s->mb_x+1 < s->mb_width){
3388             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
3389         }
3390     }
3391
3392     if(!(s->flags&CODEC_FLAG_GRAY)){
3393         if(deblock_left){
3394             for(i = !deblock_top; i<9; i++){
3395                 XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
3396                 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
3397             }
3398         }
3399         if(deblock_top){
3400             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
3401             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
3402         }
3403     }
3404 }
3405
3406 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
3407     MpegEncContext * const s = &h->s;
3408     int i;
3409
3410     src_y  -= 2 *   linesize;
3411     src_cb -= 2 * uvlinesize;
3412     src_cr -= 2 * uvlinesize;
3413
3414     // There are two lines saved, the line above the the top macroblock of a pair,
3415     // and the line above the bottom macroblock
3416     h->left_border[0]= h->top_borders[0][s->mb_x][15];
3417     h->left_border[1]= h->top_borders[1][s->mb_x][15];
3418     for(i=2; i<34; i++){
3419         h->left_border[i]= src_y[15+i*  linesize];
3420     }
3421
3422     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
3423     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
3424     *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
3425     *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
3426
3427     if(!(s->flags&CODEC_FLAG_GRAY)){
3428         h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
3429         h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
3430         h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
3431         h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
3432         for(i=2; i<18; i++){
3433             h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
3434             h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
3435         }
3436         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
3437         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
3438         *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
3439         *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
3440     }
3441 }
3442
3443 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
3444     MpegEncContext * const s = &h->s;
3445     int temp8, i;
3446     uint64_t temp64;
3447     int deblock_left = (s->mb_x > 0);
3448     int deblock_top  = (s->mb_y > 1);
3449
3450     tprintf("xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
3451
3452     src_y  -= 2 *   linesize + 1;
3453     src_cb -= 2 * uvlinesize + 1;
3454     src_cr -= 2 * uvlinesize + 1;
3455
3456 #define XCHG(a,b,t,xchg)\
3457 t= a;\
3458 if(xchg)\
3459     a= b;\
3460 b= t;
3461
3462     if(deblock_left){
3463         for(i = (!deblock_top)<<1; i<34; i++){
3464             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
3465         }
3466     }
3467
3468     if(deblock_top){
3469         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
3470         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
3471         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
3472         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
3473         if(s->mb_x+1 < s->mb_width){
3474             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
3475             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
3476         }
3477     }
3478
3479     if(!(s->flags&CODEC_FLAG_GRAY)){
3480         if(deblock_left){
3481             for(i = (!deblock_top) << 1; i<18; i++){
3482                 XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
3483                 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
3484             }
3485         }
3486         if(deblock_top){
3487             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
3488             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
3489             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
3490             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
3491         }
3492     }
3493 }
3494
3495 static void hl_decode_mb(H264Context *h){
3496     MpegEncContext * const s = &h->s;
3497     const int mb_x= s->mb_x;
3498     const int mb_y= s->mb_y;
3499     const int mb_xy= mb_x + mb_y*s->mb_stride;
3500     const int mb_type= s->current_picture.mb_type[mb_xy];
3501     uint8_t  *dest_y, *dest_cb, *dest_cr;
3502     int linesize, uvlinesize /*dct_offset*/;
3503     int i;
3504     int *block_offset = &h->block_offset[0];
3505     const unsigned int bottom = mb_y & 1;
3506     const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass);
3507     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
3508     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
3509
3510     if(!s->decode)
3511         return;
3512
3513     dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
3514     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3515     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3516
3517     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
3518     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
3519
3520     if (MB_FIELD) {
3521         linesize   = h->mb_linesize   = s->linesize * 2;
3522         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
3523         block_offset = &h->block_offset[24];
3524         if(mb_y&1){ //FIXME move out of this func?
3525             dest_y -= s->linesize*15;
3526             dest_cb-= s->uvlinesize*7;
3527             dest_cr-= s->uvlinesize*7;
3528         }
3529         if(FRAME_MBAFF) {
3530             int list;
3531             for(list=0; list<2; list++){
3532                 if(!USES_LIST(mb_type, list))
3533                     continue;
3534                 if(IS_16X16(mb_type)){
3535                     int8_t *ref = &h->ref_cache[list][scan8[0]];
3536                     fill_rectangle(ref, 4, 4, 8, 16+*ref^(s->mb_y&1), 1);
3537                 }else{
3538                     for(i=0; i<16; i+=4){
3539                         //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
3540                         int ref = h->ref_cache[list][scan8[i]];
3541                         if(ref >= 0)
3542                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, 16+ref^(s->mb_y&1), 1);
3543                     }
3544                 }
3545             }
3546         }
3547     } else {
3548         linesize   = h->mb_linesize   = s->linesize;
3549         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
3550 //        dct_offset = s->linesize * 16;
3551     }
3552
3553     if(transform_bypass){
3554         idct_dc_add =
3555         idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
3556     }else if(IS_8x8DCT(mb_type)){
3557         idct_dc_add = s->dsp.h264_idct8_dc_add;
3558         idct_add = s->dsp.h264_idct8_add;
3559     }else{
3560         idct_dc_add = s->dsp.h264_idct_dc_add;
3561         idct_add = s->dsp.h264_idct_add;
3562     }
3563
3564     if(FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
3565        && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
3566         int mbt_y = mb_y&~1;
3567         uint8_t *top_y  = s->current_picture.data[0] + (mbt_y * 16* s->linesize  ) + mb_x * 16;
3568         uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
3569         uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
3570         xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
3571     }
3572
3573     if (IS_INTRA_PCM(mb_type)) {
3574         unsigned int x, y;
3575
3576         // The pixels are stored in h->mb array in the same order as levels,
3577         // copy them in output in the correct order.
3578         for(i=0; i<16; i++) {
3579             for (y=0; y<4; y++) {
3580                 for (x=0; x<4; x++) {
3581                     *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
3582                 }
3583             }
3584         }
3585         for(i=16; i<16+4; i++) {
3586             for (y=0; y<4; y++) {
3587                 for (x=0; x<4; x++) {
3588                     *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
3589                 }
3590             }
3591         }
3592         for(i=20; i<20+4; i++) {
3593             for (y=0; y<4; y++) {
3594                 for (x=0; x<4; x++) {
3595                     *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
3596                 }
3597             }
3598         }
3599     } else {
3600         if(IS_INTRA(mb_type)){
3601             if(h->deblocking_filter && !FRAME_MBAFF)
3602                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1);
3603
3604             if(!(s->flags&CODEC_FLAG_GRAY)){
3605                 h->pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
3606                 h->pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
3607             }
3608
3609             if(IS_INTRA4x4(mb_type)){
3610                 if(!s->encoding){
3611                     if(IS_8x8DCT(mb_type)){
3612                         for(i=0; i<16; i+=4){
3613                             uint8_t * const ptr= dest_y + block_offset[i];
3614                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
3615                             const int nnz = h->non_zero_count_cache[ scan8[i] ];
3616                             h->pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
3617                                                    (h->topright_samples_available<<(i+1))&0x8000, linesize);
3618                             if(nnz){
3619                                 if(nnz == 1 && h->mb[i*16])
3620                                     idct_dc_add(ptr, h->mb + i*16, linesize);
3621                                 else
3622                                     idct_add(ptr, h->mb + i*16, linesize);
3623                             }
3624                         }
3625                     }else
3626                     for(i=0; i<16; i++){
3627                         uint8_t * const ptr= dest_y + block_offset[i];
3628                         uint8_t *topright;
3629                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
3630                         int nnz, tr;
3631
3632                         if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
3633                             const int topright_avail= (h->topright_samples_available<<i)&0x8000;
3634                             assert(mb_y || linesize <= block_offset[i]);
3635                             if(!topright_avail){
3636                                 tr= ptr[3 - linesize]*0x01010101;
3637                                 topright= (uint8_t*) &tr;
3638                             }else
3639                                 topright= ptr + 4 - linesize;
3640                         }else
3641                             topright= NULL;
3642
3643                         h->pred4x4[ dir ](ptr, topright, linesize);
3644                         nnz = h->non_zero_count_cache[ scan8[i] ];
3645                         if(nnz){
3646                             if(s->codec_id == CODEC_ID_H264){
3647                                 if(nnz == 1 && h->mb[i*16])
3648                                     idct_dc_add(ptr, h->mb + i*16, linesize);
3649                                 else
3650                                     idct_add(ptr, h->mb + i*16, linesize);
3651                             }else
3652                                 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
3653                         }
3654                     }
3655                 }
3656             }else{
3657                 h->pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
3658                 if(s->codec_id == CODEC_ID_H264){
3659                     if(!transform_bypass)
3660                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[IS_INTRA(mb_type) ? 0:3][s->qscale][0]);
3661                 }else
3662                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
3663             }
3664             if(h->deblocking_filter && !FRAME_MBAFF)
3665                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0);
3666         }else if(s->codec_id == CODEC_ID_H264){
3667             hl_motion(h, dest_y, dest_cb, dest_cr,
3668                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
3669                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
3670                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
3671         }
3672
3673
3674         if(!IS_INTRA4x4(mb_type)){
3675             if(s->codec_id == CODEC_ID_H264){
3676                 if(IS_INTRA16x16(mb_type)){
3677                     for(i=0; i<16; i++){
3678                         if(h->non_zero_count_cache[ scan8[i] ])
3679                             idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3680                         else if(h->mb[i*16])
3681                             idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3682                     }
3683                 }else{
3684                     const int di = IS_8x8DCT(mb_type) ? 4 : 1;
3685                     for(i=0; i<16; i+=di){
3686                         int nnz = h->non_zero_count_cache[ scan8[i] ];
3687                         if(nnz){
3688                             if(nnz==1 && h->mb[i*16])
3689                                 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3690                             else
3691                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3692                         }
3693                     }
3694                 }
3695             }else{
3696                 for(i=0; i<16; i++){
3697                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
3698                         uint8_t * const ptr= dest_y + block_offset[i];
3699                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
3700                     }
3701                 }
3702             }
3703         }
3704
3705         if(!(s->flags&CODEC_FLAG_GRAY)){
3706             uint8_t *dest[2] = {dest_cb, dest_cr};
3707             if(transform_bypass){
3708                 idct_add = idct_dc_add = s->dsp.add_pixels4;
3709             }else{
3710                 idct_add = s->dsp.h264_idct_add;
3711                 idct_dc_add = s->dsp.h264_idct_dc_add;
3712                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp][0]);
3713                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp][0]);
3714             }
3715             if(s->codec_id == CODEC_ID_H264){
3716                 for(i=16; i<16+8; i++){
3717                     if(h->non_zero_count_cache[ scan8[i] ])
3718                         idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
3719                     else if(h->mb[i*16])
3720                         idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
3721                 }
3722             }else{
3723                 for(i=16; i<16+8; i++){
3724                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
3725                         uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
3726                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
3727                     }
3728                 }
3729             }
3730         }
3731     }
3732     if(h->deblocking_filter) {
3733         if (FRAME_MBAFF) {
3734             //FIXME try deblocking one mb at a time?
3735             // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
3736             const int mb_y = s->mb_y - 1;
3737             uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
3738             const int mb_xy= mb_x + mb_y*s->mb_stride;
3739             const int mb_type_top   = s->current_picture.mb_type[mb_xy];
3740             const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
3741             if (!bottom) return;
3742             pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
3743             pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3744             pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3745
3746             if(IS_INTRA(mb_type_top | mb_type_bottom))
3747                 xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
3748
3749             backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
3750             // deblock a pair
3751             // top
3752             s->mb_y--;
3753             tprintf("call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
3754             fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
3755             h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy]);
3756             filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
3757             // bottom
3758             s->mb_y++;
3759             tprintf("call mbaff filter_mb\n");
3760             fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
3761             h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
3762             filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3763         } else {
3764             tprintf("call filter_mb\n");
3765             backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3766             fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
3767             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3768         }
3769     }
3770 }
3771
3772 /**
3773  * fills the default_ref_list.
3774  */
3775 static int fill_default_ref_list(H264Context *h){
3776     MpegEncContext * const s = &h->s;
3777     int i;
3778     int smallest_poc_greater_than_current = -1;
3779     Picture sorted_short_ref[32];
3780
3781     if(h->slice_type==B_TYPE){
3782         int out_i;
3783         int limit= INT_MIN;
3784
3785         /* sort frame according to poc in B slice */
3786         for(out_i=0; out_i<h->short_ref_count; out_i++){
3787             int best_i=INT_MIN;
3788             int best_poc=INT_MAX;
3789
3790             for(i=0; i<h->short_ref_count; i++){
3791                 const int poc= h->short_ref[i]->poc;
3792                 if(poc > limit && poc < best_poc){
3793                     best_poc= poc;
3794                     best_i= i;
3795                 }
3796             }
3797
3798             assert(best_i != INT_MIN);
3799
3800             limit= best_poc;
3801             sorted_short_ref[out_i]= *h->short_ref[best_i];
3802             tprintf("sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
3803             if (-1 == smallest_poc_greater_than_current) {
3804                 if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
3805                     smallest_poc_greater_than_current = out_i;
3806                 }
3807             }
3808         }
3809     }
3810
3811     if(s->picture_structure == PICT_FRAME){
3812         if(h->slice_type==B_TYPE){
3813             int list;
3814             tprintf("current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
3815
3816             // find the largest poc
3817             for(list=0; list<2; list++){
3818                 int index = 0;
3819                 int j= -99;
3820                 int step= list ? -1 : 1;
3821
3822                 for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
3823                     while(j<0 || j>= h->short_ref_count){
3824                         if(j != -99 && step == (list ? -1 : 1))
3825                             return -1;
3826                         step = -step;
3827                         j= smallest_poc_greater_than_current + (step>>1);
3828                     }
3829                     if(sorted_short_ref[j].reference != 3) continue;
3830                     h->default_ref_list[list][index  ]= sorted_short_ref[j];
3831                     h->default_ref_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
3832                 }
3833
3834                 for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
3835                     if(h->long_ref[i] == NULL) continue;
3836                     if(h->long_ref[i]->reference != 3) continue;
3837
3838                     h->default_ref_list[ list ][index  ]= *h->long_ref[i];
3839                     h->default_ref_list[ list ][index++].pic_id= i;;
3840                 }
3841
3842                 if(list && (smallest_poc_greater_than_current<=0 || smallest_poc_greater_than_current>=h->short_ref_count) && (1 < index)){
3843                     // swap the two first elements of L1 when
3844                     // L0 and L1 are identical
3845                     Picture temp= h->default_ref_list[1][0];
3846                     h->default_ref_list[1][0] = h->default_ref_list[1][1];
3847                     h->default_ref_list[1][1] = temp;
3848                 }
3849
3850                 if(index < h->ref_count[ list ])
3851                     memset(&h->default_ref_list[list][index], 0, sizeof(Picture)*(h->ref_count[ list ] - index));
3852             }
3853         }else{
3854             int index=0;
3855             for(i=0; i<h->short_ref_count; i++){
3856                 if(h->short_ref[i]->reference != 3) continue; //FIXME refernce field shit
3857                 h->default_ref_list[0][index  ]= *h->short_ref[i];
3858                 h->default_ref_list[0][index++].pic_id= h->short_ref[i]->frame_num;
3859             }
3860             for(i = 0; i < 16; i++){
3861                 if(h->long_ref[i] == NULL) continue;
3862                 if(h->long_ref[i]->reference != 3) continue;
3863                 h->default_ref_list[0][index  ]= *h->long_ref[i];
3864                 h->default_ref_list[0][index++].pic_id= i;;
3865             }
3866             if(index < h->ref_count[0])
3867                 memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
3868         }
3869     }else{ //FIELD
3870         if(h->slice_type==B_TYPE){
3871         }else{
3872             //FIXME second field balh
3873         }
3874     }
3875 #ifdef TRACE
3876     for (i=0; i<h->ref_count[0]; i++) {
3877         tprintf("List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
3878     }
3879     if(h->slice_type==B_TYPE){
3880         for (i=0; i<h->ref_count[1]; i++) {
3881             tprintf("List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[0][i].data[0]);
3882         }
3883     }
3884 #endif
3885     return 0;
3886 }
3887
3888 static void print_short_term(H264Context *h);
3889 static void print_long_term(H264Context *h);
3890
3891 static int decode_ref_pic_list_reordering(H264Context *h){
3892     MpegEncContext * const s = &h->s;
3893     int list, index;
3894
3895     print_short_term(h);
3896     print_long_term(h);
3897     if(h->slice_type==I_TYPE || h->slice_type==SI_TYPE) return 0; //FIXME move before func
3898
3899     for(list=0; list<2; list++){
3900         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
3901
3902         if(get_bits1(&s->gb)){
3903             int pred= h->curr_pic_num;
3904
3905             for(index=0; ; index++){
3906                 int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
3907                 int pic_id;
3908                 int i;
3909                 Picture *ref = NULL;
3910
3911                 if(reordering_of_pic_nums_idc==3)
3912                     break;
3913
3914                 if(index >= h->ref_count[list]){
3915                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
3916                     return -1;
3917                 }
3918
3919                 if(reordering_of_pic_nums_idc<3){
3920                     if(reordering_of_pic_nums_idc<2){
3921                         const int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
3922
3923                         if(abs_diff_pic_num >= h->max_pic_num){
3924                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
3925                             return -1;
3926                         }
3927
3928                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
3929                         else                                pred+= abs_diff_pic_num;
3930                         pred &= h->max_pic_num - 1;
3931
3932                         for(i= h->short_ref_count-1; i>=0; i--){
3933                             ref = h->short_ref[i];
3934                             assert(ref->reference == 3);
3935                             assert(!ref->long_ref);
3936                             if(ref->data[0] != NULL && ref->frame_num == pred && ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer
3937                                 break;
3938                         }
3939                         if(i>=0)
3940                             ref->pic_id= ref->frame_num;
3941                     }else{
3942                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
3943                         ref = h->long_ref[pic_id];
3944                         ref->pic_id= pic_id;
3945                         assert(ref->reference == 3);
3946                         assert(ref->long_ref);
3947                         i=0;
3948                     }
3949
3950                     if (i < 0) {
3951                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
3952                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
3953                     } else {
3954                         for(i=index; i+1<h->ref_count[list]; i++){
3955                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
3956                                 break;
3957                         }
3958                         for(; i > index; i--){
3959                             h->ref_list[list][i]= h->ref_list[list][i-1];
3960                         }
3961                         h->ref_list[list][index]= *ref;
3962                     }
3963                 }else{
3964                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
3965                     return -1;
3966                 }
3967             }
3968         }
3969
3970         if(h->slice_type!=B_TYPE) break;
3971     }
3972     for(list=0; list<2; list++){
3973         for(index= 0; index < h->ref_count[list]; index++){
3974             if(!h->ref_list[list][index].data[0])
3975                 h->ref_list[list][index]= s->current_picture;
3976         }
3977         if(h->slice_type!=B_TYPE) break;
3978     }
3979
3980     if(h->slice_type==B_TYPE && !h->direct_spatial_mv_pred)
3981         direct_dist_scale_factor(h);
3982     direct_ref_list_init(h);
3983     return 0;
3984 }
3985
3986 static void fill_mbaff_ref_list(H264Context *h){
3987     int list, i, j;
3988     for(list=0; list<2; list++){
3989         for(i=0; i<h->ref_count[list]; i++){
3990             Picture *frame = &h->ref_list[list][i];
3991             Picture *field = &h->ref_list[list][16+2*i];
3992             field[0] = *frame;
3993             for(j=0; j<3; j++)
3994                 field[0].linesize[j] <<= 1;
3995             field[1] = field[0];
3996             for(j=0; j<3; j++)
3997                 field[1].data[j] += frame->linesize[j];
3998
3999             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
4000             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
4001             for(j=0; j<2; j++){
4002                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
4003                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
4004             }
4005         }
4006     }
4007     for(j=0; j<h->ref_count[1]; j++){
4008         for(i=0; i<h->ref_count[0]; i++)
4009             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
4010         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
4011         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
4012     }
4013 }
4014
4015 static int pred_weight_table(H264Context *h){
4016     MpegEncContext * const s = &h->s;
4017     int list, i;
4018     int luma_def, chroma_def;
4019
4020     h->use_weight= 0;
4021     h->use_weight_chroma= 0;
4022     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
4023     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
4024     luma_def = 1<<h->luma_log2_weight_denom;
4025     chroma_def = 1<<h->chroma_log2_weight_denom;
4026
4027     for(list=0; list<2; list++){
4028         for(i=0; i<h->ref_count[list]; i++){
4029             int luma_weight_flag, chroma_weight_flag;
4030
4031             luma_weight_flag= get_bits1(&s->gb);
4032             if(luma_weight_flag){
4033                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
4034                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
4035                 if(   h->luma_weight[list][i] != luma_def
4036                    || h->luma_offset[list][i] != 0)
4037                     h->use_weight= 1;
4038             }else{
4039                 h->luma_weight[list][i]= luma_def;
4040                 h->luma_offset[list][i]= 0;
4041             }
4042
4043             chroma_weight_flag= get_bits1(&s->gb);
4044             if(chroma_weight_flag){
4045                 int j;
4046                 for(j=0; j<2; j++){
4047                     h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
4048                     h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
4049                     if(   h->chroma_weight[list][i][j] != chroma_def
4050                        || h->chroma_offset[list][i][j] != 0)
4051                         h->use_weight_chroma= 1;
4052                 }
4053             }else{
4054                 int j;
4055                 for(j=0; j<2; j++){
4056                     h->chroma_weight[list][i][j]= chroma_def;
4057                     h->chroma_offset[list][i][j]= 0;
4058                 }
4059             }
4060         }
4061         if(h->slice_type != B_TYPE) break;
4062     }
4063     h->use_weight= h->use_weight || h->use_weight_chroma;
4064     return 0;
4065 }
4066
4067 static void implicit_weight_table(H264Context *h){
4068     MpegEncContext * const s = &h->s;
4069     int ref0, ref1;
4070     int cur_poc = s->current_picture_ptr->poc;
4071
4072     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
4073        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
4074         h->use_weight= 0;
4075         h->use_weight_chroma= 0;
4076         return;
4077     }
4078
4079     h->use_weight= 2;
4080     h->use_weight_chroma= 2;
4081     h->luma_log2_weight_denom= 5;
4082     h->chroma_log2_weight_denom= 5;
4083
4084     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
4085         int poc0 = h->ref_list[0][ref0].poc;
4086         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
4087             int poc1 = h->ref_list[1][ref1].poc;
4088             int td = clip(poc1 - poc0, -128, 127);
4089             if(td){
4090                 int tb = clip(cur_poc - poc0, -128, 127);
4091                 int tx = (16384 + (FFABS(td) >> 1)) / td;
4092                 int dist_scale_factor = clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
4093                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
4094                     h->implicit_weight[ref0][ref1] = 32;
4095                 else
4096                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
4097             }else
4098                 h->implicit_weight[ref0][ref1] = 32;
4099         }
4100     }
4101 }
4102
4103 static inline void unreference_pic(H264Context *h, Picture *pic){
4104     int i;
4105     pic->reference=0;
4106     if(pic == h->delayed_output_pic)
4107         pic->reference=1;
4108     else{
4109         for(i = 0; h->delayed_pic[i]; i++)
4110             if(pic == h->delayed_pic[i]){
4111                 pic->reference=1;
4112                 break;
4113             }
4114     }
4115 }
4116
4117 /**
4118  * instantaneous decoder refresh.
4119  */
4120 static void idr(H264Context *h){
4121     int i;
4122
4123     for(i=0; i<16; i++){
4124         if (h->long_ref[i] != NULL) {
4125             unreference_pic(h, h->long_ref[i]);
4126             h->long_ref[i]= NULL;
4127         }
4128     }
4129     h->long_ref_count=0;
4130
4131     for(i=0; i<h->short_ref_count; i++){
4132         unreference_pic(h, h->short_ref[i]);
4133         h->short_ref[i]= NULL;
4134     }
4135     h->short_ref_count=0;
4136 }
4137
4138 /* forget old pics after a seek */
4139 static void flush_dpb(AVCodecContext *avctx){
4140     H264Context *h= avctx->priv_data;
4141     int i;
4142     for(i=0; i<16; i++) {
4143         if(h->delayed_pic[i])
4144             h->delayed_pic[i]->reference= 0;
4145         h->delayed_pic[i]= NULL;
4146     }
4147     if(h->delayed_output_pic)
4148         h->delayed_output_pic->reference= 0;
4149     h->delayed_output_pic= NULL;
4150     idr(h);
4151     if(h->s.current_picture_ptr)
4152         h->s.current_picture_ptr->reference= 0;
4153 }
4154
4155 /**
4156  *
4157  * @return the removed picture or NULL if an error occurs
4158  */
4159 static Picture * remove_short(H264Context *h, int frame_num){
4160     MpegEncContext * const s = &h->s;
4161     int i;
4162
4163     if(s->avctx->debug&FF_DEBUG_MMCO)
4164         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
4165
4166     for(i=0; i<h->short_ref_count; i++){
4167         Picture *pic= h->short_ref[i];
4168         if(s->avctx->debug&FF_DEBUG_MMCO)
4169             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
4170         if(pic->frame_num == frame_num){
4171             h->short_ref[i]= NULL;
4172             memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i - 1)*sizeof(Picture*));
4173             h->short_ref_count--;
4174             return pic;
4175         }
4176     }
4177     return NULL;
4178 }
4179
4180 /**
4181  *
4182  * @return the removed picture or NULL if an error occurs
4183  */
4184 static Picture * remove_long(H264Context *h, int i){
4185     Picture *pic;
4186
4187     pic= h->long_ref[i];
4188     h->long_ref[i]= NULL;
4189     if(pic) h->long_ref_count--;
4190
4191     return pic;
4192 }
4193
4194 /**
4195  * print short term list
4196  */
4197 static void print_short_term(H264Context *h) {
4198     uint32_t i;
4199     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
4200         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
4201         for(i=0; i<h->short_ref_count; i++){
4202             Picture *pic= h->short_ref[i];
4203             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
4204         }
4205     }
4206 }
4207
4208 /**
4209  * print long term list
4210  */
4211 static void print_long_term(H264Context *h) {
4212     uint32_t i;
4213     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
4214         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
4215         for(i = 0; i < 16; i++){
4216             Picture *pic= h->long_ref[i];
4217             if (pic) {
4218                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
4219             }
4220         }
4221     }
4222 }
4223
4224 /**
4225  * Executes the reference picture marking (memory management control operations).
4226  */
4227 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
4228     MpegEncContext * const s = &h->s;
4229     int i, j;
4230     int current_is_long=0;
4231     Picture *pic;
4232
4233     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
4234         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
4235
4236     for(i=0; i<mmco_count; i++){
4237         if(s->avctx->debug&FF_DEBUG_MMCO)
4238             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_frame_num, h->mmco[i].long_index);
4239
4240         switch(mmco[i].opcode){
4241         case MMCO_SHORT2UNUSED:
4242             pic= remove_short(h, mmco[i].short_frame_num);
4243             if(pic)
4244                 unreference_pic(h, pic);
4245             else if(s->avctx->debug&FF_DEBUG_MMCO)
4246                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_short() failure\n");
4247             break;
4248         case MMCO_SHORT2LONG:
4249             pic= remove_long(h, mmco[i].long_index);
4250             if(pic) unreference_pic(h, pic);
4251
4252             h->long_ref[ mmco[i].long_index ]= remove_short(h, mmco[i].short_frame_num);
4253             h->long_ref[ mmco[i].long_index ]->long_ref=1;
4254             h->long_ref_count++;
4255             break;
4256         case MMCO_LONG2UNUSED:
4257             pic= remove_long(h, mmco[i].long_index);
4258             if(pic)
4259                 unreference_pic(h, pic);
4260             else if(s->avctx->debug&FF_DEBUG_MMCO)
4261                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_long() failure\n");
4262             break;
4263         case MMCO_LONG:
4264             pic= remove_long(h, mmco[i].long_index);
4265             if(pic) unreference_pic(h, pic);
4266
4267             h->long_ref[ mmco[i].long_index ]= s->current_picture_ptr;
4268             h->long_ref[ mmco[i].long_index ]->long_ref=1;
4269             h->long_ref_count++;
4270
4271             current_is_long=1;
4272             break;
4273         case MMCO_SET_MAX_LONG:
4274             assert(mmco[i].long_index <= 16);
4275             // just remove the long term which index is greater than new max
4276             for(j = mmco[i].long_index; j<16; j++){
4277                 pic = remove_long(h, j);
4278                 if (pic) unreference_pic(h, pic);
4279             }
4280             break;
4281         case MMCO_RESET:
4282             while(h->short_ref_count){
4283                 pic= remove_short(h, h->short_ref[0]->frame_num);
4284                 unreference_pic(h, pic);
4285             }
4286             for(j = 0; j < 16; j++) {
4287                 pic= remove_long(h, j);
4288                 if(pic) unreference_pic(h, pic);
4289             }
4290             break;
4291         default: assert(0);
4292         }
4293     }
4294
4295     if(!current_is_long){
4296         pic= remove_short(h, s->current_picture_ptr->frame_num);
4297         if(pic){
4298             unreference_pic(h, pic);
4299             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
4300         }
4301
4302         if(h->short_ref_count)
4303             memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
4304
4305         h->short_ref[0]= s->current_picture_ptr;
4306         h->short_ref[0]->long_ref=0;
4307         h->short_ref_count++;
4308     }
4309
4310     print_short_term(h);
4311     print_long_term(h);
4312     return 0;
4313 }
4314
4315 static int decode_ref_pic_marking(H264Context *h){
4316     MpegEncContext * const s = &h->s;
4317     int i;
4318
4319     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
4320         s->broken_link= get_bits1(&s->gb) -1;
4321         h->mmco[0].long_index= get_bits1(&s->gb) - 1; // current_long_term_idx
4322         if(h->mmco[0].long_index == -1)
4323             h->mmco_index= 0;
4324         else{
4325             h->mmco[0].opcode= MMCO_LONG;
4326             h->mmco_index= 1;
4327         }
4328     }else{
4329         if(get_bits1(&s->gb)){ // adaptive_ref_pic_marking_mode_flag
4330             for(i= 0; i<MAX_MMCO_COUNT; i++) {
4331                 MMCOOpcode opcode= get_ue_golomb(&s->gb);;
4332
4333                 h->mmco[i].opcode= opcode;
4334                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
4335                     h->mmco[i].short_frame_num= (h->frame_num - get_ue_golomb(&s->gb) - 1) & ((1<<h->sps.log2_max_frame_num)-1); //FIXME fields
4336 /*                    if(h->mmco[i].short_frame_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_frame_num ] == NULL){
4337                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
4338                         return -1;
4339                     }*/
4340                 }
4341                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
4342                     h->mmco[i].long_index= get_ue_golomb(&s->gb);
4343                     if(/*h->mmco[i].long_index >= h->long_ref_count || h->long_ref[ h->mmco[i].long_index ] == NULL*/ h->mmco[i].long_index >= 16){
4344                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
4345                         return -1;
4346                     }
4347                 }
4348
4349                 if(opcode > MMCO_LONG){
4350                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
4351                     return -1;
4352                 }
4353                 if(opcode == MMCO_END)
4354                     break;
4355             }
4356             h->mmco_index= i;
4357         }else{
4358             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
4359
4360             if(h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count){ //FIXME fields
4361                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
4362                 h->mmco[0].short_frame_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
4363                 h->mmco_index= 1;
4364             }else
4365                 h->mmco_index= 0;
4366         }
4367     }
4368
4369     return 0;
4370 }
4371
4372 static int init_poc(H264Context *h){
4373     MpegEncContext * const s = &h->s;
4374     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
4375     int field_poc[2];
4376
4377     if(h->nal_unit_type == NAL_IDR_SLICE){
4378         h->frame_num_offset= 0;
4379     }else{
4380         if(h->frame_num < h->prev_frame_num)
4381             h->frame_num_offset= h->prev_frame_num_offset + max_frame_num;
4382         else
4383             h->frame_num_offset= h->prev_frame_num_offset;
4384     }
4385
4386     if(h->sps.poc_type==0){
4387         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
4388
4389         if(h->nal_unit_type == NAL_IDR_SLICE){
4390              h->prev_poc_msb=
4391              h->prev_poc_lsb= 0;
4392         }
4393
4394         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
4395             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
4396         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
4397             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
4398         else
4399             h->poc_msb = h->prev_poc_msb;
4400 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
4401         field_poc[0] =
4402         field_poc[1] = h->poc_msb + h->poc_lsb;
4403         if(s->picture_structure == PICT_FRAME)
4404             field_poc[1] += h->delta_poc_bottom;
4405     }else if(h->sps.poc_type==1){
4406         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
4407         int i;
4408
4409         if(h->sps.poc_cycle_length != 0)
4410             abs_frame_num = h->frame_num_offset + h->frame_num;
4411         else
4412             abs_frame_num = 0;
4413
4414         if(h->nal_ref_idc==0 && abs_frame_num > 0)
4415             abs_frame_num--;
4416
4417         expected_delta_per_poc_cycle = 0;
4418         for(i=0; i < h->sps.poc_cycle_length; i++)
4419             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
4420
4421         if(abs_frame_num > 0){
4422             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
4423             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
4424
4425             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
4426             for(i = 0; i <= frame_num_in_poc_cycle; i++)
4427                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
4428         } else
4429             expectedpoc = 0;
4430
4431         if(h->nal_ref_idc == 0)
4432             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
4433
4434         field_poc[0] = expectedpoc + h->delta_poc[0];
4435         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
4436
4437         if(s->picture_structure == PICT_FRAME)
4438             field_poc[1] += h->delta_poc[1];
4439     }else{
4440         int poc;
4441         if(h->nal_unit_type == NAL_IDR_SLICE){
4442             poc= 0;
4443         }else{
4444             if(h->nal_ref_idc) poc= 2*(h->frame_num_offset + h->frame_num);
4445             else               poc= 2*(h->frame_num_offset + h->frame_num) - 1;
4446         }
4447         field_poc[0]= poc;
4448         field_poc[1]= poc;
4449     }
4450
4451     if(s->picture_structure != PICT_BOTTOM_FIELD)
4452         s->current_picture_ptr->field_poc[0]= field_poc[0];
4453     if(s->picture_structure != PICT_TOP_FIELD)
4454         s->current_picture_ptr->field_poc[1]= field_poc[1];
4455     if(s->picture_structure == PICT_FRAME) // FIXME field pix?
4456         s->current_picture_ptr->poc= FFMIN(field_poc[0], field_poc[1]);
4457
4458     return 0;
4459 }
4460
4461 /**
4462  * decodes a slice header.
4463  * this will allso call MPV_common_init() and frame_start() as needed
4464  */
4465 static int decode_slice_header(H264Context *h){
4466     MpegEncContext * const s = &h->s;
4467     int first_mb_in_slice, pps_id;
4468     int num_ref_idx_active_override_flag;
4469     static const uint8_t slice_type_map[5]= {P_TYPE, B_TYPE, I_TYPE, SP_TYPE, SI_TYPE};
4470     int slice_type;
4471     int default_ref_list_done = 0;
4472
4473     s->current_picture.reference= h->nal_ref_idc != 0;
4474     s->dropable= h->nal_ref_idc == 0;
4475
4476     first_mb_in_slice= get_ue_golomb(&s->gb);
4477
4478     slice_type= get_ue_golomb(&s->gb);
4479     if(slice_type > 9){
4480         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
4481         return -1;
4482     }
4483     if(slice_type > 4){
4484         slice_type -= 5;
4485         h->slice_type_fixed=1;
4486     }else
4487         h->slice_type_fixed=0;
4488
4489     slice_type= slice_type_map[ slice_type ];
4490     if (slice_type == I_TYPE
4491         || (h->slice_num != 0 && slice_type == h->slice_type) ) {
4492         default_ref_list_done = 1;
4493     }
4494     h->slice_type= slice_type;
4495
4496     s->pict_type= h->slice_type; // to make a few old func happy, it's wrong though
4497
4498     pps_id= get_ue_golomb(&s->gb);
4499     if(pps_id>255){
4500         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
4501         return -1;
4502     }
4503     h->pps= h->pps_buffer[pps_id];
4504     if(h->pps.slice_group_count == 0){
4505         av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n");
4506         return -1;
4507     }
4508
4509     h->sps= h->sps_buffer[ h->pps.sps_id ];
4510     if(h->sps.log2_max_frame_num == 0){
4511         av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n");
4512         return -1;
4513     }
4514
4515     if(h->dequant_coeff_pps != pps_id){
4516         h->dequant_coeff_pps = pps_id;
4517         init_dequant_tables(h);
4518     }
4519
4520     s->mb_width= h->sps.mb_width;
4521     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
4522
4523     h->b_stride=  s->mb_width*4;
4524     h->b8_stride= s->mb_width*2;
4525
4526     s->width = 16*s->mb_width - 2*(h->sps.crop_left + h->sps.crop_right );
4527     if(h->sps.frame_mbs_only_flag)
4528         s->height= 16*s->mb_height - 2*(h->sps.crop_top  + h->sps.crop_bottom);
4529     else
4530         s->height= 16*s->mb_height - 4*(h->sps.crop_top  + h->sps.crop_bottom); //FIXME recheck
4531
4532     if (s->context_initialized
4533         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
4534         free_tables(h);
4535         MPV_common_end(s);
4536     }
4537     if (!s->context_initialized) {
4538         if (MPV_common_init(s) < 0)
4539             return -1;
4540
4541         if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
4542             memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
4543             memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
4544         }else{
4545             int i;
4546             for(i=0; i<16; i++){
4547 #define T(x) (x>>2) | ((x<<2) & 0xF)
4548                 h->zigzag_scan[i] = T(zigzag_scan[i]);
4549                 h-> field_scan[i] = T( field_scan[i]);
4550 #undef T
4551             }
4552         }
4553         if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
4554             memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
4555             memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
4556             memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
4557             memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
4558         }else{
4559             int i;
4560             for(i=0; i<64; i++){
4561 #define T(x) (x>>3) | ((x&7)<<3)
4562                 h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
4563                 h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
4564                 h->field_scan8x8[i]        = T(field_scan8x8[i]);
4565                 h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
4566 #undef T
4567             }
4568         }
4569         if(h->sps.transform_bypass){ //FIXME same ugly
4570             h->zigzag_scan_q0          = zigzag_scan;
4571             h->zigzag_scan8x8_q0       = zigzag_scan8x8;
4572             h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
4573             h->field_scan_q0           = field_scan;
4574             h->field_scan8x8_q0        = field_scan8x8;
4575             h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
4576         }else{
4577             h->zigzag_scan_q0          = h->zigzag_scan;
4578             h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
4579             h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
4580             h->field_scan_q0           = h->field_scan;
4581             h->field_scan8x8_q0        = h->field_scan8x8;
4582             h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
4583         }
4584
4585         alloc_tables(h);
4586
4587         s->avctx->width = s->width;
4588         s->avctx->height = s->height;
4589         s->avctx->sample_aspect_ratio= h->sps.sar;
4590         if(!s->avctx->sample_aspect_ratio.den)
4591             s->avctx->sample_aspect_ratio.den = 1;
4592
4593         if(h->sps.timing_info_present_flag){
4594             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
4595             if(h->x264_build > 0 && h->x264_build < 44)
4596                 s->avctx->time_base.den *= 2;
4597             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
4598                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
4599         }
4600     }
4601
4602     if(h->slice_num == 0){
4603         if(frame_start(h) < 0)
4604             return -1;
4605     }
4606
4607     s->current_picture_ptr->frame_num= //FIXME frame_num cleanup
4608     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
4609
4610     h->mb_mbaff = 0;
4611     h->mb_aff_frame = 0;
4612     if(h->sps.frame_mbs_only_flag){
4613         s->picture_structure= PICT_FRAME;
4614     }else{
4615         if(get_bits1(&s->gb)) { //field_pic_flag
4616             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
4617             av_log(h->s.avctx, AV_LOG_ERROR, "PAFF interlacing is not implemented\n");
4618         } else {
4619             s->picture_structure= PICT_FRAME;
4620             h->mb_aff_frame = h->sps.mb_aff;
4621         }
4622     }
4623
4624     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
4625     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << h->mb_aff_frame;
4626     if(s->mb_y >= s->mb_height){
4627         return -1;
4628     }
4629
4630     if(s->picture_structure==PICT_FRAME){
4631         h->curr_pic_num=   h->frame_num;
4632         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
4633     }else{
4634         h->curr_pic_num= 2*h->frame_num;
4635         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
4636     }
4637
4638     if(h->nal_unit_type == NAL_IDR_SLICE){
4639         get_ue_golomb(&s->gb); /* idr_pic_id */
4640     }
4641
4642     if(h->sps.poc_type==0){
4643         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
4644
4645         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
4646             h->delta_poc_bottom= get_se_golomb(&s->gb);
4647         }
4648     }
4649
4650     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
4651         h->delta_poc[0]= get_se_golomb(&s->gb);
4652
4653         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
4654             h->delta_poc[1]= get_se_golomb(&s->gb);
4655     }
4656
4657     init_poc(h);
4658
4659     if(h->pps.redundant_pic_cnt_present){
4660         h->redundant_pic_count= get_ue_golomb(&s->gb);
4661     }
4662
4663     //set defaults, might be overriden a few line later
4664     h->ref_count[0]= h->pps.ref_count[0];
4665     h->ref_count[1]= h->pps.ref_count[1];
4666
4667     if(h->slice_type == P_TYPE || h->slice_type == SP_TYPE || h->slice_type == B_TYPE){
4668         if(h->slice_type == B_TYPE){
4669             h->direct_spatial_mv_pred= get_bits1(&s->gb);
4670             if(h->sps.mb_aff && h->direct_spatial_mv_pred)
4671                 av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + spatial direct mode is not implemented\n");
4672         }
4673         num_ref_idx_active_override_flag= get_bits1(&s->gb);
4674
4675         if(num_ref_idx_active_override_flag){
4676             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
4677             if(h->slice_type==B_TYPE)
4678                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
4679
4680             if(h->ref_count[0] > 32 || h->ref_count[1] > 32){
4681                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
4682                 return -1;
4683             }
4684         }
4685     }
4686
4687     if(!default_ref_list_done){
4688         fill_default_ref_list(h);
4689     }
4690
4691     if(decode_ref_pic_list_reordering(h) < 0)
4692         return -1;
4693
4694     if(   (h->pps.weighted_pred          && (h->slice_type == P_TYPE || h->slice_type == SP_TYPE ))
4695        || (h->pps.weighted_bipred_idc==1 && h->slice_type==B_TYPE ) )
4696         pred_weight_table(h);
4697     else if(h->pps.weighted_bipred_idc==2 && h->slice_type==B_TYPE)
4698         implicit_weight_table(h);
4699     else
4700         h->use_weight = 0;
4701
4702     if(s->current_picture.reference)
4703         decode_ref_pic_marking(h);
4704
4705     if(FRAME_MBAFF)
4706         fill_mbaff_ref_list(h);
4707
4708     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE && h->pps.cabac )
4709         h->cabac_init_idc = get_ue_golomb(&s->gb);
4710
4711     h->last_qscale_diff = 0;
4712     s->qscale = h->pps.init_qp + get_se_golomb(&s->gb);
4713     if(s->qscale<0 || s->qscale>51){
4714         av_log(s->avctx, AV_LOG_ERROR, "QP %d out of range\n", s->qscale);
4715         return -1;
4716     }
4717     h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
4718     //FIXME qscale / qp ... stuff
4719     if(h->slice_type == SP_TYPE){
4720         get_bits1(&s->gb); /* sp_for_switch_flag */
4721     }
4722     if(h->slice_type==SP_TYPE || h->slice_type == SI_TYPE){
4723         get_se_golomb(&s->gb); /* slice_qs_delta */
4724     }
4725
4726     h->deblocking_filter = 1;
4727     h->slice_alpha_c0_offset = 0;
4728     h->slice_beta_offset = 0;
4729     if( h->pps.deblocking_filter_parameters_present ) {
4730         h->deblocking_filter= get_ue_golomb(&s->gb);
4731         if(h->deblocking_filter < 2)
4732             h->deblocking_filter^= 1; // 1<->0
4733
4734         if( h->deblocking_filter ) {
4735             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4736             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4737         }
4738     }
4739     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4740        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type != I_TYPE)
4741        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type == B_TYPE)
4742        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4743         h->deblocking_filter= 0;
4744
4745 #if 0 //FMO
4746     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4747         slice_group_change_cycle= get_bits(&s->gb, ?);
4748 #endif
4749
4750     h->slice_num++;
4751
4752     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4753     h->emu_edge_height= FRAME_MBAFF ? 0 : h->emu_edge_width;
4754
4755     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4756         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%d frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s\n",
4757                h->slice_num,
4758                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4759                first_mb_in_slice,
4760                av_get_pict_type_char(h->slice_type),
4761                pps_id, h->frame_num,
4762                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4763                h->ref_count[0], h->ref_count[1],
4764                s->qscale,
4765                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4766                h->use_weight,
4767                h->use_weight==1 && h->use_weight_chroma ? "c" : ""
4768                );
4769     }
4770
4771     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !s->current_picture.reference){
4772         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
4773         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
4774     }else{
4775         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
4776         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
4777     }
4778
4779     return 0;
4780 }
4781
4782 /**
4783  *
4784  */
4785 static inline int get_level_prefix(GetBitContext *gb){
4786     unsigned int buf;
4787     int log;
4788
4789     OPEN_READER(re, gb);
4790     UPDATE_CACHE(re, gb);
4791     buf=GET_CACHE(re, gb);
4792
4793     log= 32 - av_log2(buf);
4794 #ifdef TRACE
4795     print_bin(buf>>(32-log), log);
4796     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4797 #endif
4798
4799     LAST_SKIP_BITS(re, gb, log);
4800     CLOSE_READER(re, gb);
4801
4802     return log-1;
4803 }
4804
4805 static inline int get_dct8x8_allowed(H264Context *h){
4806     int i;
4807     for(i=0; i<4; i++){
4808         if(!IS_SUB_8X8(h->sub_mb_type[i])
4809            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4810             return 0;
4811     }
4812     return 1;
4813 }
4814
4815 /**
4816  * decodes a residual block.
4817  * @param n block index
4818  * @param scantable scantable
4819  * @param max_coeff number of coefficients in the block
4820  * @return <0 if an error occured
4821  */
4822 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4823     MpegEncContext * const s = &h->s;
4824     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4825     int level[16];
4826     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4827
4828     //FIXME put trailing_onex into the context
4829
4830     if(n == CHROMA_DC_BLOCK_INDEX){
4831         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4832         total_coeff= coeff_token>>2;
4833     }else{
4834         if(n == LUMA_DC_BLOCK_INDEX){
4835             total_coeff= pred_non_zero_count(h, 0);
4836             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4837             total_coeff= coeff_token>>2;
4838         }else{
4839             total_coeff= pred_non_zero_count(h, n);
4840             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4841             total_coeff= coeff_token>>2;
4842             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4843         }
4844     }
4845
4846     //FIXME set last_non_zero?
4847
4848     if(total_coeff==0)
4849         return 0;
4850     if(total_coeff<0) {
4851         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff<0)\n", s->mb_x, s->mb_y);
4852         return -1;
4853     }
4854
4855     trailing_ones= coeff_token&3;
4856     tprintf("trailing:%d, total:%d\n", trailing_ones, total_coeff);
4857     assert(total_coeff<=16);
4858
4859     for(i=0; i<trailing_ones; i++){
4860         level[i]= 1 - 2*get_bits1(gb);
4861     }
4862
4863     if(i<total_coeff) {
4864         int level_code, mask;
4865         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4866         int prefix= get_level_prefix(gb);
4867
4868         //first coefficient has suffix_length equal to 0 or 1
4869         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4870             if(suffix_length)
4871                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4872             else
4873                 level_code= (prefix<<suffix_length); //part
4874         }else if(prefix==14){
4875             if(suffix_length)
4876                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4877             else
4878                 level_code= prefix + get_bits(gb, 4); //part
4879         }else if(prefix==15){
4880             level_code= (prefix<<suffix_length) + get_bits(gb, 12); //part
4881             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4882         }else{
4883             av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4884             return -1;
4885         }
4886
4887         if(trailing_ones < 3) level_code += 2;
4888
4889         suffix_length = 1;
4890         if(level_code > 5)
4891             suffix_length++;
4892         mask= -(level_code&1);
4893         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4894         i++;
4895
4896         //remaining coefficients have suffix_length > 0
4897         for(;i<total_coeff;i++) {
4898             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4899             prefix = get_level_prefix(gb);
4900             if(prefix<15){
4901                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4902             }else if(prefix==15){
4903                 level_code =  (prefix<<suffix_length) + get_bits(gb, 12);
4904             }else{
4905                 av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4906                 return -1;
4907             }
4908             mask= -(level_code&1);
4909             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4910             if(level_code > suffix_limit[suffix_length])
4911                 suffix_length++;
4912         }
4913     }
4914
4915     if(total_coeff == max_coeff)
4916         zeros_left=0;
4917     else{
4918         if(n == CHROMA_DC_BLOCK_INDEX)
4919             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4920         else
4921             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4922     }
4923
4924     coeff_num = zeros_left + total_coeff - 1;
4925     j = scantable[coeff_num];
4926     if(n > 24){
4927         block[j] = level[0];
4928         for(i=1;i<total_coeff;i++) {
4929             if(zeros_left <= 0)
4930                 run_before = 0;
4931             else if(zeros_left < 7){
4932                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4933             }else{
4934                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4935             }
4936             zeros_left -= run_before;
4937             coeff_num -= 1 + run_before;
4938             j= scantable[ coeff_num ];
4939
4940             block[j]= level[i];
4941         }
4942     }else{
4943         block[j] = (level[0] * qmul[j] + 32)>>6;
4944         for(i=1;i<total_coeff;i++) {
4945             if(zeros_left <= 0)
4946                 run_before = 0;
4947             else if(zeros_left < 7){
4948                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4949             }else{
4950                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4951             }
4952             zeros_left -= run_before;
4953             coeff_num -= 1 + run_before;
4954             j= scantable[ coeff_num ];
4955
4956             block[j]= (level[i] * qmul[j] + 32)>>6;
4957         }
4958     }
4959
4960     if(zeros_left<0){
4961         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4962         return -1;
4963     }
4964
4965     return 0;
4966 }
4967
4968 static void predict_field_decoding_flag(H264Context *h){
4969     MpegEncContext * const s = &h->s;
4970     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4971     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4972                 ? s->current_picture.mb_type[mb_xy-1]
4973                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4974                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4975                 : 0;
4976     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4977 }
4978
4979 /**
4980  * decodes a P_SKIP or B_SKIP macroblock
4981  */
4982 static void decode_mb_skip(H264Context *h){
4983     MpegEncContext * const s = &h->s;
4984     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4985     int mb_type=0;
4986
4987     memset(h->non_zero_count[mb_xy], 0, 16);
4988     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4989
4990     if(MB_FIELD)
4991         mb_type|= MB_TYPE_INTERLACED;
4992
4993     if( h->slice_type == B_TYPE )
4994     {
4995         // just for fill_caches. pred_direct_motion will set the real mb_type
4996         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4997
4998         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4999         pred_direct_motion(h, &mb_type);
5000         mb_type|= MB_TYPE_SKIP;
5001     }
5002     else
5003     {
5004         int mx, my;
5005         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
5006
5007         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
5008         pred_pskip_motion(h, &mx, &my);
5009         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
5010         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
5011     }
5012
5013     write_back_motion(h, mb_type);
5014     s->current_picture.mb_type[mb_xy]= mb_type;
5015     s->current_picture.qscale_table[mb_xy]= s->qscale;
5016     h->slice_table[ mb_xy ]= h->slice_num;
5017     h->prev_mb_skipped= 1;
5018 }
5019
5020 /**
5021  * decodes a macroblock
5022  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5023  */
5024 static int decode_mb_cavlc(H264Context *h){
5025     MpegEncContext * const s = &h->s;
5026     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5027     int mb_type, partition_count, cbp;
5028     int dct8x8_allowed= h->pps.transform_8x8_mode;
5029
5030     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
5031
5032     tprintf("pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5033     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
5034                 down the code */
5035     if(h->slice_type != I_TYPE && h->slice_type != SI_TYPE){
5036         if(s->mb_skip_run==-1)
5037             s->mb_skip_run= get_ue_golomb(&s->gb);
5038
5039         if (s->mb_skip_run--) {
5040             if(FRAME_MBAFF && (s->mb_y&1) == 0){
5041                 if(s->mb_skip_run==0)
5042                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
5043                 else
5044                     predict_field_decoding_flag(h);
5045             }
5046             decode_mb_skip(h);
5047             return 0;
5048         }
5049     }
5050     if(FRAME_MBAFF){
5051         if( (s->mb_y&1) == 0 )
5052             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
5053     }else
5054         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5055
5056     h->prev_mb_skipped= 0;
5057
5058     mb_type= get_ue_golomb(&s->gb);
5059     if(h->slice_type == B_TYPE){
5060         if(mb_type < 23){
5061             partition_count= b_mb_type_info[mb_type].partition_count;
5062             mb_type=         b_mb_type_info[mb_type].type;
5063         }else{
5064             mb_type -= 23;
5065             goto decode_intra_mb;
5066         }
5067     }else if(h->slice_type == P_TYPE /*|| h->slice_type == SP_TYPE */){
5068         if(mb_type < 5){
5069             partition_count= p_mb_type_info[mb_type].partition_count;
5070             mb_type=         p_mb_type_info[mb_type].type;
5071         }else{
5072             mb_type -= 5;
5073             goto decode_intra_mb;
5074         }
5075     }else{
5076        assert(h->slice_type == I_TYPE);
5077 decode_intra_mb:
5078         if(mb_type > 25){
5079             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
5080             return -1;
5081         }
5082         partition_count=0;
5083         cbp= i_mb_type_info[mb_type].cbp;
5084         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5085         mb_type= i_mb_type_info[mb_type].type;
5086     }
5087
5088     if(MB_FIELD)
5089         mb_type |= MB_TYPE_INTERLACED;
5090
5091     h->slice_table[ mb_xy ]= h->slice_num;
5092
5093     if(IS_INTRA_PCM(mb_type)){
5094         unsigned int x, y;
5095
5096         // we assume these blocks are very rare so we dont optimize it
5097         align_get_bits(&s->gb);
5098
5099         // The pixels are stored in the same order as levels in h->mb array.
5100         for(y=0; y<16; y++){
5101             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
5102             for(x=0; x<16; x++){
5103                 tprintf("LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
5104                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
5105             }
5106         }
5107         for(y=0; y<8; y++){
5108             const int index= 256 + 4*(y&3) + 32*(y>>2);
5109             for(x=0; x<8; x++){
5110                 tprintf("CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
5111                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
5112             }
5113         }
5114         for(y=0; y<8; y++){
5115             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
5116             for(x=0; x<8; x++){
5117                 tprintf("CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
5118                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
5119             }
5120         }
5121
5122         // In deblocking, the quantizer is 0
5123         s->current_picture.qscale_table[mb_xy]= 0;
5124         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0);
5125         // All coeffs are present
5126         memset(h->non_zero_count[mb_xy], 16, 16);
5127
5128         s->current_picture.mb_type[mb_xy]= mb_type;
5129         return 0;
5130     }
5131
5132     if(MB_MBAFF){
5133         h->ref_count[0] <<= 1;
5134         h->ref_count[1] <<= 1;
5135     }
5136
5137     fill_caches(h, mb_type, 0);
5138
5139     //mb_pred
5140     if(IS_INTRA(mb_type)){
5141 //            init_top_left_availability(h);
5142             if(IS_INTRA4x4(mb_type)){
5143                 int i;
5144                 int di = 1;
5145                 if(dct8x8_allowed && get_bits1(&s->gb)){
5146                     mb_type |= MB_TYPE_8x8DCT;
5147                     di = 4;
5148                 }
5149
5150 //                fill_intra4x4_pred_table(h);
5151                 for(i=0; i<16; i+=di){
5152                     int mode= pred_intra_mode(h, i);
5153
5154                     if(!get_bits1(&s->gb)){
5155                         const int rem_mode= get_bits(&s->gb, 3);
5156                         mode = rem_mode + (rem_mode >= mode);
5157                     }
5158
5159                     if(di==4)
5160                         fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5161                     else
5162                         h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
5163                 }
5164                 write_back_intra_pred_mode(h);
5165                 if( check_intra4x4_pred_mode(h) < 0)
5166                     return -1;
5167             }else{
5168                 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
5169                 if(h->intra16x16_pred_mode < 0)
5170                     return -1;
5171             }
5172             h->chroma_pred_mode= get_ue_golomb(&s->gb);
5173
5174             h->chroma_pred_mode= check_intra_pred_mode(h, h->chroma_pred_mode);
5175             if(h->chroma_pred_mode < 0)
5176                 return -1;
5177     }else if(partition_count==4){
5178         int i, j, sub_partition_count[4], list, ref[2][4];
5179
5180         if(h->slice_type == B_TYPE){
5181             for(i=0; i<4; i++){
5182                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
5183                 if(h->sub_mb_type[i] >=13){
5184                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %d out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
5185                     return -1;
5186                 }
5187                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5188                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5189             }
5190             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
5191                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
5192                 pred_direct_motion(h, &mb_type);
5193                 h->ref_cache[0][scan8[4]] =
5194                 h->ref_cache[1][scan8[4]] =
5195                 h->ref_cache[0][scan8[12]] =
5196                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5197             }
5198         }else{
5199             assert(h->slice_type == P_TYPE || h->slice_type == SP_TYPE); //FIXME SP correct ?
5200             for(i=0; i<4; i++){
5201                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
5202                 if(h->sub_mb_type[i] >=4){
5203                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %d out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
5204                     return -1;
5205                 }
5206                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5207                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5208             }
5209         }
5210
5211         for(list=0; list<2; list++){
5212             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
5213             if(ref_count == 0) continue;
5214             for(i=0; i<4; i++){
5215                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
5216                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
5217                     ref[list][i] = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
5218                 }else{
5219                  //FIXME
5220                     ref[list][i] = -1;
5221                 }
5222             }
5223         }
5224
5225         if(dct8x8_allowed)
5226             dct8x8_allowed = get_dct8x8_allowed(h);
5227
5228         for(list=0; list<2; list++){
5229             const int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
5230             if(ref_count == 0) continue;
5231
5232             for(i=0; i<4; i++){
5233                 if(IS_DIRECT(h->sub_mb_type[i])) {
5234                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
5235                     continue;
5236                 }
5237                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
5238                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5239
5240                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
5241                     const int sub_mb_type= h->sub_mb_type[i];
5242                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5243                     for(j=0; j<sub_partition_count[i]; j++){
5244                         int mx, my;
5245                         const int index= 4*i + block_width*j;
5246                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5247                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
5248                         mx += get_se_golomb(&s->gb);
5249                         my += get_se_golomb(&s->gb);
5250                         tprintf("final mv:%d %d\n", mx, my);
5251
5252                         if(IS_SUB_8X8(sub_mb_type)){
5253                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]=
5254                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5255                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]=
5256                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5257                         }else if(IS_SUB_8X4(sub_mb_type)){
5258                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]= mx;
5259                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]= my;
5260                         }else if(IS_SUB_4X8(sub_mb_type)){
5261                             mv_cache[ 0 ][0]= mv_cache[ 8 ][0]= mx;
5262                             mv_cache[ 0 ][1]= mv_cache[ 8 ][1]= my;
5263                         }else{
5264                             assert(IS_SUB_4X4(sub_mb_type));
5265                             mv_cache[ 0 ][0]= mx;
5266                             mv_cache[ 0 ][1]= my;
5267                         }
5268                     }
5269                 }else{
5270                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5271                     p[0] = p[1]=
5272                     p[8] = p[9]= 0;
5273                 }
5274             }
5275         }
5276     }else if(IS_DIRECT(mb_type)){
5277         pred_direct_motion(h, &mb_type);
5278         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5279     }else{
5280         int list, mx, my, i;
5281          //FIXME we should set ref_idx_l? to 0 if we use that later ...
5282         if(IS_16X16(mb_type)){
5283             for(list=0; list<2; list++){
5284                 if(h->ref_count[list]>0){
5285                     if(IS_DIR(mb_type, 0, list)){
5286                         const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
5287                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
5288                     }else
5289                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (LIST_NOT_USED&0xFF), 1);
5290                 }
5291             }
5292             for(list=0; list<2; list++){
5293                 if(IS_DIR(mb_type, 0, list)){
5294                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
5295                     mx += get_se_golomb(&s->gb);
5296                     my += get_se_golomb(&s->gb);
5297                     tprintf("final mv:%d %d\n", mx, my);
5298
5299                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5300                 }else
5301                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5302             }
5303         }
5304         else if(IS_16X8(mb_type)){
5305             for(list=0; list<2; list++){
5306                 if(h->ref_count[list]>0){
5307                     for(i=0; i<2; i++){
5308                         if(IS_DIR(mb_type, i, list)){
5309                             const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
5310                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
5311                         }else
5312                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5313                     }
5314                 }
5315             }
5316             for(list=0; list<2; list++){
5317                 for(i=0; i<2; i++){
5318                     if(IS_DIR(mb_type, i, list)){
5319                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
5320                         mx += get_se_golomb(&s->gb);
5321                         my += get_se_golomb(&s->gb);
5322                         tprintf("final mv:%d %d\n", mx, my);
5323
5324                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5325                     }else
5326                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5327                 }
5328             }
5329         }else{
5330             assert(IS_8X16(mb_type));
5331             for(list=0; list<2; list++){
5332                 if(h->ref_count[list]>0){
5333                     for(i=0; i<2; i++){
5334                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5335                             const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
5336                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
5337                         }else
5338                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5339                     }
5340                 }
5341             }
5342             for(list=0; list<2; list++){
5343                 for(i=0; i<2; i++){
5344                     if(IS_DIR(mb_type, i, list)){
5345                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
5346                         mx += get_se_golomb(&s->gb);
5347                         my += get_se_golomb(&s->gb);
5348                         tprintf("final mv:%d %d\n", mx, my);
5349
5350                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5351                     }else
5352                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5353                 }
5354             }
5355         }
5356     }
5357
5358     if(IS_INTER(mb_type))
5359         write_back_motion(h, mb_type);
5360
5361     if(!IS_INTRA16x16(mb_type)){
5362         cbp= get_ue_golomb(&s->gb);
5363         if(cbp > 47){
5364             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%d) at %d %d\n", cbp, s->mb_x, s->mb_y);
5365             return -1;
5366         }
5367
5368         if(IS_INTRA4x4(mb_type))
5369             cbp= golomb_to_intra4x4_cbp[cbp];
5370         else
5371             cbp= golomb_to_inter_cbp[cbp];
5372     }
5373     h->cbp = cbp;
5374
5375     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
5376         if(get_bits1(&s->gb))
5377             mb_type |= MB_TYPE_8x8DCT;
5378     }
5379     s->current_picture.mb_type[mb_xy]= mb_type;
5380
5381     if(cbp || IS_INTRA16x16(mb_type)){
5382         int i8x8, i4x4, chroma_idx;
5383         int chroma_qp, dquant;
5384         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
5385         const uint8_t *scan, *scan8x8, *dc_scan;
5386
5387 //        fill_non_zero_count_cache(h);
5388
5389         if(IS_INTERLACED(mb_type)){
5390             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
5391             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5392             dc_scan= luma_dc_field_scan;
5393         }else{
5394             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
5395             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5396             dc_scan= luma_dc_zigzag_scan;
5397         }
5398
5399         dquant= get_se_golomb(&s->gb);
5400
5401         if( dquant > 25 || dquant < -26 ){
5402             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
5403             return -1;
5404         }
5405
5406         s->qscale += dquant;
5407         if(((unsigned)s->qscale) > 51){
5408             if(s->qscale<0) s->qscale+= 52;
5409             else            s->qscale-= 52;
5410         }
5411
5412         h->chroma_qp= chroma_qp= get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
5413         if(IS_INTRA16x16(mb_type)){
5414             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
5415                 return -1; //FIXME continue if partitioned and other return -1 too
5416             }
5417
5418             assert((cbp&15) == 0 || (cbp&15) == 15);
5419
5420             if(cbp&15){
5421                 for(i8x8=0; i8x8<4; i8x8++){
5422                     for(i4x4=0; i4x4<4; i4x4++){
5423                         const int index= i4x4 + 4*i8x8;
5424                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
5425                             return -1;
5426                         }
5427                     }
5428                 }
5429             }else{
5430                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5431             }
5432         }else{
5433             for(i8x8=0; i8x8<4; i8x8++){
5434                 if(cbp & (1<<i8x8)){
5435                     if(IS_8x8DCT(mb_type)){
5436                         DCTELEM *buf = &h->mb[64*i8x8];
5437                         uint8_t *nnz;
5438                         for(i4x4=0; i4x4<4; i4x4++){
5439                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
5440                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
5441                                 return -1;
5442                         }
5443                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5444                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
5445                     }else{
5446                         for(i4x4=0; i4x4<4; i4x4++){
5447                             const int index= i4x4 + 4*i8x8;
5448
5449                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
5450                                 return -1;
5451                             }
5452                         }
5453                     }
5454                 }else{
5455                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5456                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5457                 }
5458             }
5459         }
5460
5461         if(cbp&0x30){
5462             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
5463                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
5464                     return -1;
5465                 }
5466         }
5467
5468         if(cbp&0x20){
5469             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
5470                 for(i4x4=0; i4x4<4; i4x4++){
5471                     const int index= 16 + 4*chroma_idx + i4x4;
5472                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][chroma_qp], 15) < 0){
5473                         return -1;
5474                     }
5475                 }
5476             }
5477         }else{
5478             uint8_t * const nnz= &h->non_zero_count_cache[0];
5479             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5480             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5481         }
5482     }else{
5483         uint8_t * const nnz= &h->non_zero_count_cache[0];
5484         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5485         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5486         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5487     }
5488     s->current_picture.qscale_table[mb_xy]= s->qscale;
5489     write_back_non_zero_count(h);
5490
5491     if(MB_MBAFF){
5492         h->ref_count[0] >>= 1;
5493         h->ref_count[1] >>= 1;
5494     }
5495
5496     return 0;
5497 }
5498
5499 static int decode_cabac_field_decoding_flag(H264Context *h) {
5500     MpegEncContext * const s = &h->s;
5501     const int mb_x = s->mb_x;
5502     const int mb_y = s->mb_y & ~1;
5503     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
5504     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
5505
5506     unsigned int ctx = 0;
5507
5508     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
5509         ctx += 1;
5510     }
5511     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
5512         ctx += 1;
5513     }
5514
5515     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
5516 }
5517
5518 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
5519     uint8_t *state= &h->cabac_state[ctx_base];
5520     int mb_type;
5521
5522     if(intra_slice){
5523         MpegEncContext * const s = &h->s;
5524         const int mba_xy = h->left_mb_xy[0];
5525         const int mbb_xy = h->top_mb_xy;
5526         int ctx=0;
5527         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
5528             ctx++;
5529         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
5530             ctx++;
5531         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
5532             return 0;   /* I4x4 */
5533         state += 2;
5534     }else{
5535         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
5536             return 0;   /* I4x4 */
5537     }
5538
5539     if( get_cabac_terminate( &h->cabac ) )
5540         return 25;  /* PCM */
5541
5542     mb_type = 1; /* I16x16 */
5543     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
5544     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
5545         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
5546     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
5547     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
5548     return mb_type;
5549 }
5550
5551 static int decode_cabac_mb_type( H264Context *h ) {
5552     MpegEncContext * const s = &h->s;
5553
5554     if( h->slice_type == I_TYPE ) {
5555         return decode_cabac_intra_mb_type(h, 3, 1);
5556     } else if( h->slice_type == P_TYPE ) {
5557         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5558             /* P-type */
5559             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5560                 /* P_L0_D16x16, P_8x8 */
5561                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5562             } else {
5563                 /* P_L0_D8x16, P_L0_D16x8 */
5564                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5565             }
5566         } else {
5567             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
5568         }
5569     } else if( h->slice_type == B_TYPE ) {
5570         const int mba_xy = h->left_mb_xy[0];
5571         const int mbb_xy = h->top_mb_xy;
5572         int ctx = 0;
5573         int bits;
5574
5575         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
5576             ctx++;
5577         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
5578             ctx++;
5579
5580         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
5581             return 0; /* B_Direct_16x16 */
5582
5583         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
5584             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
5585         }
5586
5587         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
5588         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
5589         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
5590         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5591         if( bits < 8 )
5592             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
5593         else if( bits == 13 ) {
5594             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
5595         } else if( bits == 14 )
5596             return 11; /* B_L1_L0_8x16 */
5597         else if( bits == 15 )
5598             return 22; /* B_8x8 */
5599
5600         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5601         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5602     } else {
5603         /* TODO SI/SP frames? */
5604         return -1;
5605     }
5606 }
5607
5608 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
5609     MpegEncContext * const s = &h->s;
5610     int mba_xy, mbb_xy;
5611     int ctx = 0;
5612
5613     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
5614         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
5615         mba_xy = mb_xy - 1;
5616         if( (mb_y&1)
5617             && h->slice_table[mba_xy] == h->slice_num
5618             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
5619             mba_xy += s->mb_stride;
5620         if( MB_FIELD ){
5621             mbb_xy = mb_xy - s->mb_stride;
5622             if( !(mb_y&1)
5623                 && h->slice_table[mbb_xy] == h->slice_num
5624                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
5625                 mbb_xy -= s->mb_stride;
5626         }else
5627             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
5628     }else{
5629         int mb_xy = mb_x + mb_y*s->mb_stride;
5630         mba_xy = mb_xy - 1;
5631         mbb_xy = mb_xy - s->mb_stride;
5632     }
5633
5634     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5635         ctx++;
5636     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5637         ctx++;
5638
5639     if( h->slice_type == B_TYPE )
5640         ctx += 13;
5641     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
5642 }
5643
5644 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5645     int mode = 0;
5646
5647     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5648         return pred_mode;
5649
5650     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5651     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5652     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5653
5654     if( mode >= pred_mode )
5655         return mode + 1;
5656     else
5657         return mode;
5658 }
5659
5660 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5661     const int mba_xy = h->left_mb_xy[0];
5662     const int mbb_xy = h->top_mb_xy;
5663
5664     int ctx = 0;
5665
5666     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5667     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5668         ctx++;
5669
5670     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5671         ctx++;
5672
5673     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5674         return 0;
5675
5676     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5677         return 1;
5678     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5679         return 2;
5680     else
5681         return 3;
5682 }
5683
/**
 * Column (0..3) of each of the 16 blocks, indexed by block number.
 */
static const uint8_t block_idx_x[16] = {
    0, 1, 0, 1,
    2, 3, 2, 3,
    0, 1, 0, 1,
    2, 3, 2, 3
};

/**
 * Row (0..3) of each of the 16 blocks, indexed by block number.
 */
static const uint8_t block_idx_y[16] = {
    0, 0, 1, 1,
    0, 0, 1, 1,
    2, 2, 3, 3,
    2, 2, 3, 3
};

/**
 * Inverse of the two tables above: block_idx_xy[x][y] is the block number
 * located at column x, row y.
 */
static const uint8_t block_idx_xy[4][4] = {
    /* x = 0 */ { 0, 2,  8, 10 },
    /* x = 1 */ { 1, 3,  9, 11 },
    /* x = 2 */ { 4, 6, 12, 14 },
    /* x = 3 */ { 5, 7, 13, 15 }
};
5696
5697 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5698     int cbp = 0;
5699     int cbp_b = -1;
5700     int i8x8;
5701
5702     if( h->slice_table[h->top_mb_xy] == h->slice_num ) {
5703         cbp_b = h->top_cbp;
5704         tprintf("cbp_b = top_cbp = %x\n", cbp_b);
5705     }
5706
5707     for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5708         int cbp_a = -1;
5709         int x, y;
5710         int ctx = 0;
5711
5712         x = block_idx_x[4*i8x8];
5713         y = block_idx_y[4*i8x8];
5714
5715         if( x > 0 )
5716             cbp_a = cbp;
5717         else if( h->slice_table[h->left_mb_xy[0]] == h->slice_num ) {
5718             cbp_a = h->left_cbp;
5719             tprintf("cbp_a = left_cbp = %x\n", cbp_a);
5720         }
5721
5722         if( y > 0 )
5723             cbp_b = cbp;
5724
5725         /* No need to test for skip as we put 0 for skip block */
5726         /* No need to test for IPCM as we put 1 for IPCM block */
5727         if( cbp_a >= 0 ) {
5728             int i8x8a = block_idx_xy[(x-1)&0x03][y]/4;
5729             if( ((cbp_a >> i8x8a)&0x01) == 0 )
5730                 ctx++;
5731         }
5732
5733         if( cbp_b >= 0 ) {
5734             int i8x8b = block_idx_xy[x][(y-1)&0x03]/4;
5735             if( ((cbp_b >> i8x8b)&0x01) == 0 )
5736                 ctx += 2;
5737         }
5738
5739         if( get_cabac( &h->cabac, &h->cabac_state[73 + ctx] ) ) {
5740             cbp |= 1 << i8x8;
5741         }
5742     }
5743     return cbp;
5744 }
5745 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5746     int ctx;
5747     int cbp_a, cbp_b;
5748
5749     cbp_a = (h->left_cbp>>4)&0x03;
5750     cbp_b = (h-> top_cbp>>4)&0x03;
5751
5752     ctx = 0;
5753     if( cbp_a > 0 ) ctx++;
5754     if( cbp_b > 0 ) ctx += 2;
5755     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5756         return 0;
5757
5758     ctx = 4;
5759     if( cbp_a == 2 ) ctx++;
5760     if( cbp_b == 2 ) ctx += 2;
5761     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5762 }
5763 static int decode_cabac_mb_dqp( H264Context *h) {
5764     MpegEncContext * const s = &h->s;
5765     int mbn_xy;
5766     int   ctx = 0;
5767     int   val = 0;
5768
5769     if( s->mb_x > 0 )
5770         mbn_xy = s->mb_x + s->mb_y*s->mb_stride - 1;
5771     else
5772         mbn_xy = s->mb_width - 1 + (s->mb_y-1)*s->mb_stride;
5773
5774     if( h->last_qscale_diff != 0 )
5775         ctx++;
5776
5777     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5778         if( ctx < 2 )
5779             ctx = 2;
5780         else
5781             ctx = 3;
5782         val++;
5783         if(val > 102) //prevent infinite loop
5784             return INT_MIN;
5785     }
5786
5787     if( val&0x01 )
5788         return (val + 1)/2;
5789     else
5790         return -(val + 1)/2;
5791 }
5792 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5793     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5794         return 0;   /* 8x8 */
5795     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5796         return 1;   /* 8x4 */
5797     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5798         return 2;   /* 4x8 */
5799     return 3;       /* 4x4 */
5800 }
5801 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5802     int type;
5803     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5804         return 0;   /* B_Direct_8x8 */
5805     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5806         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5807     type = 3;
5808     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5809         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5810             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5811         type += 4;
5812     }
5813     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5814     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5815     return type;
5816 }
5817
5818 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5819     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5820 }
5821
5822 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5823     int refa = h->ref_cache[list][scan8[n] - 1];
5824     int refb = h->ref_cache[list][scan8[n] - 8];
5825     int ref  = 0;
5826     int ctx  = 0;
5827
5828     if( h->slice_type == B_TYPE) {
5829         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5830             ctx++;
5831         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5832             ctx += 2;
5833     } else {
5834         if( refa > 0 )
5835             ctx++;
5836         if( refb > 0 )
5837             ctx += 2;
5838     }
5839
5840     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5841         ref++;
5842         if( ctx < 4 )
5843             ctx = 4;
5844         else
5845             ctx = 5;
5846     }
5847     return ref;
5848 }
5849
5850 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5851     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5852                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5853     int ctxbase = (l == 0) ? 40 : 47;
5854     int ctx, mvd;
5855
5856     if( amvd < 3 )
5857         ctx = 0;
5858     else if( amvd > 32 )
5859         ctx = 2;
5860     else
5861         ctx = 1;
5862
5863     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5864         return 0;
5865
5866     mvd= 1;
5867     ctx= 3;
5868     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5869         mvd++;
5870         if( ctx < 6 )
5871             ctx++;
5872     }
5873
5874     if( mvd >= 9 ) {
5875         int k = 3;
5876         while( get_cabac_bypass( &h->cabac ) ) {
5877             mvd += 1 << k;
5878             k++;
5879         }
5880         while( k-- ) {
5881             if( get_cabac_bypass( &h->cabac ) )
5882                 mvd += 1 << k;
5883         }
5884     }
5885     return get_cabac_bypass_sign( &h->cabac, -mvd );
5886 }
5887
5888 static int inline get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
5889     int nza, nzb;
5890     int ctx = 0;
5891
5892     if( cat == 0 ) {
5893         nza = h->left_cbp&0x100;
5894         nzb = h-> top_cbp&0x100;
5895     } else if( cat == 1 || cat == 2 ) {
5896         nza = h->non_zero_count_cache[scan8[idx] - 1];
5897         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5898     } else if( cat == 3 ) {
5899         nza = (h->left_cbp>>(6+idx))&0x01;
5900         nzb = (h-> top_cbp>>(6+idx))&0x01;
5901     } else {
5902         assert(cat == 4);
5903         nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5904         nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5905     }
5906
5907     if( nza > 0 )
5908         ctx++;
5909
5910     if( nzb > 0 )
5911         ctx += 2;
5912
5913     return ctx + 4 * cat;
5914 }
5915
/**
 * Per-position offset table with 63 entries, one for every coefficient
 * position of an 8x8 block except the last. Values rise from 0 to 8.
 */
static const __attribute((used)) uint8_t last_coeff_flag_offset_8x8[63] = {
    0, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,
    2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4,
    5, 5, 5, 5, 6, 6, 6, 6,
    7, 7, 7, 7, 8, 8, 8
};
5922
5923 static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff) {
5924     const int mb_xy  = h->s.mb_x + h->s.mb_y*h->s.mb_stride;
5925     static const int significant_coeff_flag_offset[2][6] = {
5926       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5927       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5928     };
5929     static const int last_coeff_flag_offset[2][6] = {
5930       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5931       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5932     };
5933     static const int coeff_abs_level_m1_offset[6] = {
5934         227+0, 227+10, 227+20, 227+30, 227+39, 426
5935     };
5936     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5937       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5938         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5939         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5940        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5941       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5942         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5943         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5944         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5945     };
5946
5947     int index[64];
5948
5949     int last;
5950     int coeff_count = 0;
5951
5952     int abslevel1 = 1;
5953     int abslevelgt1 = 0;
5954
5955     uint8_t *significant_coeff_ctx_base;
5956     uint8_t *last_coeff_ctx_base;
5957     uint8_t *abs_level_m1_ctx_base;
5958
5959 #ifndef ARCH_X86
5960 #define CABAC_ON_STACK
5961 #endif
5962 #ifdef CABAC_ON_STACK
5963 #define CC &cc
5964     CABACContext cc;
5965     cc.range     = h->cabac.range;
5966     cc.low       = h->cabac.low;
5967     cc.bytestream= h->cabac.bytestream;
5968 #else
5969 #define CC &h->cabac
5970 #endif
5971
5972
5973     /* cat: 0-> DC 16x16  n = 0
5974      *      1-> AC 16x16  n = luma4x4idx
5975      *      2-> Luma4x4   n = luma4x4idx
5976      *      3-> DC Chroma n = iCbCr
5977      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5978      *      5-> Luma8x8   n = 4 * luma8x8idx
5979      */
5980
5981     /* read coded block flag */
5982     if( cat != 5 ) {
5983         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n ) ] ) == 0 ) {
5984             if( cat == 1 || cat == 2 )
5985                 h->non_zero_count_cache[scan8[n]] = 0;
5986             else if( cat == 4 )
5987                 h->non_zero_count_cache[scan8[16+n]] = 0;
5988 #ifdef CABAC_ON_STACK
5989             h->cabac.range     = cc.range     ;
5990             h->cabac.low       = cc.low       ;
5991             h->cabac.bytestream= cc.bytestream;
5992 #endif
5993             return 0;
5994         }
5995     }
5996
5997     significant_coeff_ctx_base = h->cabac_state
5998         + significant_coeff_flag_offset[MB_FIELD][cat];
5999     last_coeff_ctx_base = h->cabac_state
6000         + last_coeff_flag_offset[MB_FIELD][cat];
6001     abs_level_m1_ctx_base = h->cabac_state
6002         + coeff_abs_level_m1_offset[cat];
6003
6004     if( cat == 5 ) {
6005 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
6006         for(last= 0; last < coefs; last++) { \
6007             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
6008             if( get_cabac( CC, sig_ctx )) { \
6009                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
6010                 index[coeff_count++] = last; \
6011                 if( get_cabac( CC, last_ctx ) ) { \
6012                     last= max_coeff; \
6013                     break; \
6014                 } \
6015             } \
6016         }\
6017         if( last == max_coeff -1 ) {\
6018             index[coeff_count++] = last;\
6019         }
6020         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
6021 #if defined(ARCH_X86) && !(defined(PIC) && defined(__GNUC__))
6022         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
6023     } else {
6024         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
6025 #else
6026         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
6027     } else {
6028         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
6029 #endif
6030     }
6031     assert(coeff_count > 0);
6032
6033     if( cat == 0 )
6034         h->cbp_table[mb_xy] |= 0x100;
6035     else if( cat == 1 || cat == 2 )
6036         h->non_zero_count_cache[scan8[n]] = coeff_count;
6037     else if( cat == 3 )
6038         h->cbp_table[mb_xy] |= 0x40 << n;
6039     else if( cat == 4 )
6040         h->non_zero_count_cache[scan8[16+n]] = coeff_count;
6041     else {
6042         assert( cat == 5 );
6043         fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
6044     }
6045
6046     for( coeff_count--; coeff_count >= 0; coeff_count-- ) {
6047         uint8_t *ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 )) + abs_level_m1_ctx_base;
6048         int j= scantable[index[coeff_count]];
6049
6050         if( get_cabac( CC, ctx ) == 0 ) {
6051             if( !qmul ) {
6052                 block[j] = get_cabac_bypass_sign( CC, -1);
6053             }else{
6054                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;;
6055             }
6056
6057             abslevel1++;
6058         } else {
6059             int coeff_abs = 2;
6060             ctx = 5 + FFMIN( 4, abslevelgt1 ) + abs_level_m1_ctx_base;
6061             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
6062                 coeff_abs++;
6063             }
6064
6065             if( coeff_abs >= 15 ) {
6066                 int j = 0;
6067                 while( get_cabac_bypass( CC ) ) {
6068                     j++;
6069                 }
6070
6071                 coeff_abs=1;
6072                 while( j-- ) {
6073                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
6074                 }
6075                 coeff_abs+= 14;
6076             }
6077
6078             if( !qmul ) {
6079                 if( get_cabac_bypass( CC ) ) block[j] = -coeff_abs;
6080                 else                                block[j] =  coeff_abs;
6081             }else{
6082                 if( get_cabac_bypass( CC ) ) block[j] = (-coeff_abs * qmul[j] + 32) >> 6;
6083                 else                                block[j] = ( coeff_abs * qmul[j] + 32) >> 6;
6084             }
6085
6086             abslevelgt1++;
6087         }
6088     }
6089 #ifdef CABAC_ON_STACK
6090             h->cabac.range     = cc.range     ;
6091             h->cabac.low       = cc.low       ;
6092             h->cabac.bytestream= cc.bytestream;
6093 #endif
6094     return 0;
6095 }
6096
6097 static void inline compute_mb_neighbors(H264Context *h)
6098 {
6099     MpegEncContext * const s = &h->s;
6100     const int mb_xy  = s->mb_x + s->mb_y*s->mb_stride;
6101     h->top_mb_xy     = mb_xy - s->mb_stride;
6102     h->left_mb_xy[0] = mb_xy - 1;
6103     if(FRAME_MBAFF){
6104         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
6105         const int top_pair_xy      = pair_xy     - s->mb_stride;
6106         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
6107         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
6108         const int curr_mb_frame_flag = !MB_FIELD;
6109         const int bottom = (s->mb_y & 1);
6110         if (bottom
6111                 ? !curr_mb_frame_flag // bottom macroblock
6112                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
6113                 ) {
6114             h->top_mb_xy -= s->mb_stride;
6115         }
6116         if (left_mb_frame_flag != curr_mb_frame_flag) {
6117             h->left_mb_xy[0] = pair_xy - 1;
6118         }
6119     }
6120     return;
6121 }
6122
6123 /**
6124  * decodes a macroblock
6125  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
6126  */
6127 static int decode_mb_cabac(H264Context *h) {
6128     MpegEncContext * const s = &h->s;
6129     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
6130     int mb_type, partition_count, cbp = 0;
6131     int dct8x8_allowed= h->pps.transform_8x8_mode;
6132
6133     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
6134
6135     tprintf("pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
6136     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE ) {
6137         int skip;
6138         /* a skipped mb needs the aff flag from the following mb */
6139         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
6140             predict_field_decoding_flag(h);
6141         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
6142             skip = h->next_mb_skipped;
6143         else
6144             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
6145         /* read skip flags */
6146         if( skip ) {
6147             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
6148                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
6149                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
6150                 if(h->next_mb_skipped)
6151                     predict_field_decoding_flag(h);
6152                 else
6153                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
6154             }
6155
6156             decode_mb_skip(h);
6157
6158             h->cbp_table[mb_xy] = 0;
6159             h->chroma_pred_mode_table[mb_xy] = 0;
6160             h->last_qscale_diff = 0;
6161
6162             return 0;
6163
6164         }
6165     }
6166     if(FRAME_MBAFF){
6167         if( (s->mb_y&1) == 0 )
6168             h->mb_mbaff =
6169             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
6170     }else
6171         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
6172
6173     h->prev_mb_skipped = 0;
6174
6175     compute_mb_neighbors(h);
6176     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
6177         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
6178         return -1;
6179     }
6180
6181     if( h->slice_type == B_TYPE ) {
6182         if( mb_type < 23 ){
6183             partition_count= b_mb_type_info[mb_type].partition_count;
6184             mb_type=         b_mb_type_info[mb_type].type;
6185         }else{
6186             mb_type -= 23;
6187             goto decode_intra_mb;
6188         }
6189     } else if( h->slice_type == P_TYPE ) {
6190         if( mb_type < 5) {
6191             partition_count= p_mb_type_info[mb_type].partition_count;
6192             mb_type=         p_mb_type_info[mb_type].type;
6193         } else {
6194             mb_type -= 5;
6195             goto decode_intra_mb;
6196         }
6197     } else {
6198        assert(h->slice_type == I_TYPE);
6199 decode_intra_mb:
6200         partition_count = 0;
6201         cbp= i_mb_type_info[mb_type].cbp;
6202         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
6203         mb_type= i_mb_type_info[mb_type].type;
6204     }
6205     if(MB_FIELD)
6206         mb_type |= MB_TYPE_INTERLACED;
6207
6208     h->slice_table[ mb_xy ]= h->slice_num;
6209
6210     if(IS_INTRA_PCM(mb_type)) {
6211         const uint8_t *ptr;
6212         unsigned int x, y;
6213
6214         // We assume these blocks are very rare so we dont optimize it.
6215         // FIXME The two following lines get the bitstream position in the cabac
6216         // decode, I think it should be done by a function in cabac.h (or cabac.c).
6217         ptr= h->cabac.bytestream;
6218         if (h->cabac.low&0x1) ptr-=CABAC_BITS/8;
6219
6220         // The pixels are stored in the same order as levels in h->mb array.
6221         for(y=0; y<16; y++){
6222             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
6223             for(x=0; x<16; x++){
6224                 tprintf("LUMA ICPM LEVEL (%3d)\n", *ptr);
6225                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
6226             }
6227         }
6228         for(y=0; y<8; y++){
6229             const int index= 256 + 4*(y&3) + 32*(y>>2);
6230             for(x=0; x<8; x++){
6231                 tprintf("CHROMA U ICPM LEVEL (%3d)\n", *ptr);
6232                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
6233             }
6234         }
6235         for(y=0; y<8; y++){
6236             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
6237             for(x=0; x<8; x++){
6238                 tprintf("CHROMA V ICPM LEVEL (%3d)\n", *ptr);
6239                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
6240             }
6241         }
6242
6243         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
6244
6245         // All blocks are present
6246         h->cbp_table[mb_xy] = 0x1ef;
6247         h->chroma_pred_mode_table[mb_xy] = 0;
6248         // In deblocking, the quantizer is 0
6249         s->current_picture.qscale_table[mb_xy]= 0;
6250         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0);
6251         // All coeffs are present
6252         memset(h->non_zero_count[mb_xy], 16, 16);
6253         s->current_picture.mb_type[mb_xy]= mb_type;
6254         return 0;
6255     }
6256
6257     if(MB_MBAFF){
6258         h->ref_count[0] <<= 1;
6259         h->ref_count[1] <<= 1;
6260     }
6261
6262     fill_caches(h, mb_type, 0);
6263
6264     if( IS_INTRA( mb_type ) ) {
6265         int i;
6266         if( IS_INTRA4x4( mb_type ) ) {
6267             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
6268                 mb_type |= MB_TYPE_8x8DCT;
6269                 for( i = 0; i < 16; i+=4 ) {
6270                     int pred = pred_intra_mode( h, i );
6271                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
6272                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
6273                 }
6274             } else {
6275                 for( i = 0; i < 16; i++ ) {
6276                     int pred = pred_intra_mode( h, i );
6277                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
6278
6279                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
6280                 }
6281             }
6282             write_back_intra_pred_mode(h);
6283             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
6284         } else {
6285             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
6286             if( h->intra16x16_pred_mode < 0 ) return -1;
6287         }
6288         h->chroma_pred_mode_table[mb_xy] =
6289             h->chroma_pred_mode          = decode_cabac_mb_chroma_pre_mode( h );
6290
6291         h->chroma_pred_mode= check_intra_pred_mode( h, h->chroma_pred_mode );
6292         if( h->chroma_pred_mode < 0 ) return -1;
6293     } else if( partition_count == 4 ) {
6294         int i, j, sub_partition_count[4], list, ref[2][4];
6295
6296         if( h->slice_type == B_TYPE ) {
6297             for( i = 0; i < 4; i++ ) {
6298                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
6299                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
6300                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
6301             }
6302             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
6303                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
6304                 pred_direct_motion(h, &mb_type);
6305                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
6306                     for( i = 0; i < 4; i++ )
6307                         if( IS_DIRECT(h->sub_mb_type[i]) )
6308                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
6309                 }
6310             }
6311         } else {
6312             for( i = 0; i < 4; i++ ) {
6313                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
6314                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
6315                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
6316             }
6317         }
6318
6319         for( list = 0; list < 2; list++ ) {
6320             if( h->ref_count[list] > 0 ) {
6321                 for( i = 0; i < 4; i++ ) {
6322                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
6323                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
6324                         if( h->ref_count[list] > 1 )
6325                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
6326                         else
6327                             ref[list][i] = 0;
6328                     } else {
6329                         ref[list][i] = -1;
6330                     }
6331                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
6332                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
6333                 }
6334             }
6335         }
6336
6337         if(dct8x8_allowed)
6338             dct8x8_allowed = get_dct8x8_allowed(h);
6339
6340         for(list=0; list<2; list++){
6341             for(i=0; i<4; i++){
6342                 if(IS_DIRECT(h->sub_mb_type[i])){
6343                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
6344                     continue;
6345                 }
6346                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
6347
6348                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
6349                     const int sub_mb_type= h->sub_mb_type[i];
6350                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
6351                     for(j=0; j<sub_partition_count[i]; j++){
6352                         int mpx, mpy;
6353                         int mx, my;
6354                         const int index= 4*i + block_width*j;
6355                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
6356                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
6357                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
6358
6359                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
6360                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
6361                         tprintf("final mv:%d %d\n", mx, my);
6362
6363                         if(IS_SUB_8X8(sub_mb_type)){
6364                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]=
6365                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
6366                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]=
6367                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
6368
6369                             mvd_cache[ 0 ][0]= mvd_cache[ 1 ][0]=
6370                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
6371                             mvd_cache[ 0 ][1]= mvd_cache[ 1 ][1]=
6372                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
6373                         }else if(IS_SUB_8X4(sub_mb_type)){
6374                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]= mx;
6375                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]= my;
6376
6377                             mvd_cache[ 0 ][0]= mvd_cache[ 1 ][0]= mx- mpx;
6378                             mvd_cache[ 0 ][1]= mvd_cache[ 1 ][1]= my - mpy;
6379                         }else if(IS_SUB_4X8(sub_mb_type)){
6380                             mv_cache[ 0 ][0]= mv_cache[ 8 ][0]= mx;
6381                             mv_cache[ 0 ][1]= mv_cache[ 8 ][1]= my;
6382
6383                             mvd_cache[ 0 ][0]= mvd_cache[ 8 ][0]= mx - mpx;
6384                             mvd_cache[ 0 ][1]= mvd_cache[ 8 ][1]= my - mpy;
6385                         }else{
6386                             assert(IS_SUB_4X4(sub_mb_type));
6387                             mv_cache[ 0 ][0]= mx;
6388                             mv_cache[ 0 ][1]= my;
6389
6390                             mvd_cache[ 0 ][0]= mx - mpx;
6391                             mvd_cache[ 0 ][1]= my - mpy;
6392                         }
6393                     }
6394                 }else{
6395                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
6396                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
6397                     p[0] = p[1] = p[8] = p[9] = 0;
6398                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
6399                 }
6400             }
6401         }
6402     } else if( IS_DIRECT(mb_type) ) {
6403         pred_direct_motion(h, &mb_type);
6404         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
6405         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
6406         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
6407     } else {
6408         int list, mx, my, i, mpx, mpy;
6409         if(IS_16X16(mb_type)){
6410             for(list=0; list<2; list++){
6411                 if(IS_DIR(mb_type, 0, list)){
6412                     if(h->ref_count[list] > 0 ){
6413                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
6414                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
6415                     }
6416                 }else
6417                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
6418             }
6419             for(list=0; list<2; list++){
6420                 if(IS_DIR(mb_type, 0, list)){
6421                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
6422
6423                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
6424                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
6425                     tprintf("final mv:%d %d\n", mx, my);
6426
6427                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
6428                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
6429                 }else
6430                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
6431             }
6432         }
6433         else if(IS_16X8(mb_type)){
6434             for(list=0; list<2; list++){
6435                 if(h->ref_count[list]>0){
6436                     for(i=0; i<2; i++){
6437                         if(IS_DIR(mb_type, i, list)){
6438                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
6439                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
6440                         }else
6441                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
6442                     }
6443                 }
6444             }
6445             for(list=0; list<2; list++){
6446                 for(i=0; i<2; i++){
6447                     if(IS_DIR(mb_type, i, list)){
6448                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
6449                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
6450                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
6451                         tprintf("final mv:%d %d\n", mx, my);
6452
6453                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
6454                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
6455                     }else{
6456                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
6457                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
6458                     }
6459                 }
6460             }
6461         }else{
6462             assert(IS_8X16(mb_type));
6463             for(list=0; list<2; list++){
6464                 if(h->ref_count[list]>0){
6465                     for(i=0; i<2; i++){
6466                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
6467                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
6468                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
6469                         }else
6470                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
6471                     }
6472                 }
6473             }
6474             for(list=0; list<2; list++){
6475                 for(i=0; i<2; i++){
6476                     if(IS_DIR(mb_type, i, list)){
6477                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
6478                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
6479                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
6480
6481                         tprintf("final mv:%d %d\n", mx, my);
6482                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
6483                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
6484                     }else{
6485                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
6486                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
6487                     }
6488                 }
6489             }
6490         }
6491     }
6492
6493    if( IS_INTER( mb_type ) ) {
6494         h->chroma_pred_mode_table[mb_xy] = 0;
6495         write_back_motion( h, mb_type );
6496    }
6497
6498     if( !IS_INTRA16x16( mb_type ) ) {
6499         cbp  = decode_cabac_mb_cbp_luma( h );
6500         cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
6501     }
6502
6503     h->cbp_table[mb_xy] = h->cbp = cbp;
6504
6505     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
6506         if( decode_cabac_mb_transform_size( h ) )
6507             mb_type |= MB_TYPE_8x8DCT;
6508     }
6509     s->current_picture.mb_type[mb_xy]= mb_type;
6510
6511     if( cbp || IS_INTRA16x16( mb_type ) ) {
6512         const uint8_t *scan, *scan8x8, *dc_scan;
6513         int dqp;
6514
6515         if(IS_INTERLACED(mb_type)){
6516             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
6517             scan= s->qscale ? h->field_scan : h->field_scan_q0;
6518             dc_scan= luma_dc_field_scan;
6519         }else{
6520             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
6521             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
6522             dc_scan= luma_dc_zigzag_scan;
6523         }
6524
6525         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
6526         if( dqp == INT_MIN ){
6527             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
6528             return -1;
6529         }
6530         s->qscale += dqp;
6531         if(((unsigned)s->qscale) > 51){
6532             if(s->qscale<0) s->qscale+= 52;
6533             else            s->qscale-= 52;
6534         }
6535         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
6536
6537         if( IS_INTRA16x16( mb_type ) ) {
6538             int i;
6539             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
6540             if( decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16) < 0)
6541                 return -1;
6542             if( cbp&15 ) {
6543                 for( i = 0; i < 16; i++ ) {
6544                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
6545                     if( decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 )
6546                         return -1;
6547                 }
6548             } else {
6549                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
6550             }
6551         } else {
6552             int i8x8, i4x4;
6553             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
6554                 if( cbp & (1<<i8x8) ) {
6555                     if( IS_8x8DCT(mb_type) ) {
6556                         if( decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
6557                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64) < 0 )
6558                             return -1;
6559                     } else
6560                     for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
6561                         const int index = 4*i8x8 + i4x4;
6562                         //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
6563 //START_TIMER
6564                         if( decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) < 0 )
6565                             return -1;
6566 //STOP_TIMER("decode_residual")
6567                     }
6568                 } else {
6569                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
6570                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
6571                 }
6572             }
6573         }
6574
6575         if( cbp&0x30 ){
6576             int c;
6577             for( c = 0; c < 2; c++ ) {
6578                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
6579                 if( decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4) < 0)
6580                     return -1;
6581             }
6582         }
6583
6584         if( cbp&0x20 ) {
6585             int c, i;
6586             for( c = 0; c < 2; c++ ) {
6587                 for( i = 0; i < 4; i++ ) {
6588                     const int index = 16 + 4 * c + i;
6589                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
6590                     if( decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp], 15) < 0)
6591                         return -1;
6592                 }
6593             }
6594         } else {
6595             uint8_t * const nnz= &h->non_zero_count_cache[0];
6596             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6597             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6598         }
6599     } else {
6600         uint8_t * const nnz= &h->non_zero_count_cache[0];
6601         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
6602         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6603         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6604         h->last_qscale_diff = 0;
6605     }
6606
6607     s->current_picture.qscale_table[mb_xy]= s->qscale;
6608     write_back_non_zero_count(h);
6609
6610     if(MB_MBAFF){
6611         h->ref_count[0] >>= 1;
6612         h->ref_count[1] >>= 1;
6613     }
6614
6615     return 0;
6616 }
6617
6618
6619 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6620     int i, d;
6621     const int index_a = qp + h->slice_alpha_c0_offset;
6622     const int alpha = (alpha_table+52)[index_a];
6623     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6624
6625     if( bS[0] < 4 ) {
6626         int8_t tc[4];
6627         for(i=0; i<4; i++)
6628             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6629         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6630     } else {
6631         /* 16px edge length, because bS=4 is triggered by being at
6632          * the edge of an intra MB, so all 4 bS are the same */
6633             for( d = 0; d < 16; d++ ) {
6634                 const int p0 = pix[-1];
6635                 const int p1 = pix[-2];
6636                 const int p2 = pix[-3];
6637
6638                 const int q0 = pix[0];
6639                 const int q1 = pix[1];
6640                 const int q2 = pix[2];
6641
6642                 if( FFABS( p0 - q0 ) < alpha &&
6643                     FFABS( p1 - p0 ) < beta &&
6644                     FFABS( q1 - q0 ) < beta ) {
6645
6646                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6647                         if( FFABS( p2 - p0 ) < beta)
6648                         {
6649                             const int p3 = pix[-4];
6650                             /* p0', p1', p2' */
6651                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6652                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6653                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6654                         } else {
6655                             /* p0' */
6656                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6657                         }
6658                         if( FFABS( q2 - q0 ) < beta)
6659                         {
6660                             const int q3 = pix[3];
6661                             /* q0', q1', q2' */
6662                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6663                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6664                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6665                         } else {
6666                             /* q0' */
6667                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6668                         }
6669                     }else{
6670                         /* p0', q0' */
6671                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6672                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6673                     }
6674                     tprintf("filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
6675                 }
6676                 pix += stride;
6677             }
6678     }
6679 }
6680 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6681     int i;
6682     const int index_a = qp + h->slice_alpha_c0_offset;
6683     const int alpha = (alpha_table+52)[index_a];
6684     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6685
6686     if( bS[0] < 4 ) {
6687         int8_t tc[4];
6688         for(i=0; i<4; i++)
6689             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6690         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6691     } else {
6692         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6693     }
6694 }
6695
6696 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6697     int i;
6698     for( i = 0; i < 16; i++, pix += stride) {
6699         int index_a;
6700         int alpha;
6701         int beta;
6702
6703         int qp_index;
6704         int bS_index = (i >> 1);
6705         if (!MB_FIELD) {
6706             bS_index &= ~1;
6707             bS_index |= (i & 1);
6708         }
6709
6710         if( bS[bS_index] == 0 ) {
6711             continue;
6712         }
6713
6714         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6715         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6716         alpha = (alpha_table+52)[index_a];
6717         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6718
6719         if( bS[bS_index] < 4 ) {
6720             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
6721             const int p0 = pix[-1];
6722             const int p1 = pix[-2];
6723             const int p2 = pix[-3];
6724             const int q0 = pix[0];
6725             const int q1 = pix[1];
6726             const int q2 = pix[2];
6727
6728             if( FFABS( p0 - q0 ) < alpha &&
6729                 FFABS( p1 - p0 ) < beta &&
6730                 FFABS( q1 - q0 ) < beta ) {
6731                 int tc = tc0;
6732                 int i_delta;
6733
6734                 if( FFABS( p2 - p0 ) < beta ) {
6735                     pix[-2] = p1 + clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6736                     tc++;
6737                 }
6738                 if( FFABS( q2 - q0 ) < beta ) {
6739                     pix[1] = q1 + clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6740                     tc++;
6741                 }
6742
6743                 i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6744                 pix[-1] = clip_uint8( p0 + i_delta );    /* p0' */
6745                 pix[0]  = clip_uint8( q0 - i_delta );    /* q0' */
6746                 tprintf("filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6747             }
6748         }else{
6749             const int p0 = pix[-1];
6750             const int p1 = pix[-2];
6751             const int p2 = pix[-3];
6752
6753             const int q0 = pix[0];
6754             const int q1 = pix[1];
6755             const int q2 = pix[2];
6756
6757             if( FFABS( p0 - q0 ) < alpha &&
6758                 FFABS( p1 - p0 ) < beta &&
6759                 FFABS( q1 - q0 ) < beta ) {
6760
6761                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6762                     if( FFABS( p2 - p0 ) < beta)
6763                     {
6764                         const int p3 = pix[-4];
6765                         /* p0', p1', p2' */
6766                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6767                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6768                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6769                     } else {
6770                         /* p0' */
6771                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6772                     }
6773                     if( FFABS( q2 - q0 ) < beta)
6774                     {
6775                         const int q3 = pix[3];
6776                         /* q0', q1', q2' */
6777                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6778                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6779                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6780                     } else {
6781                         /* q0' */
6782                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6783                     }
6784                 }else{
6785                     /* p0', q0' */
6786                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6787                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6788                 }
6789                 tprintf("filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6790             }
6791         }
6792     }
6793 }
6794 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6795     int i;
6796     for( i = 0; i < 8; i++, pix += stride) {
6797         int index_a;
6798         int alpha;
6799         int beta;
6800
6801         int qp_index;
6802         int bS_index = i;
6803
6804         if( bS[bS_index] == 0 ) {
6805             continue;
6806         }
6807
6808         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6809         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6810         alpha = (alpha_table+52)[index_a];
6811         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6812
6813         if( bS[bS_index] < 4 ) {
6814             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6815             const int p0 = pix[-1];
6816             const int p1 = pix[-2];
6817             const int q0 = pix[0];
6818             const int q1 = pix[1];
6819
6820             if( FFABS( p0 - q0 ) < alpha &&
6821                 FFABS( p1 - p0 ) < beta &&
6822                 FFABS( q1 - q0 ) < beta ) {
6823                 const int i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6824
6825                 pix[-1] = clip_uint8( p0 + i_delta );    /* p0' */
6826                 pix[0]  = clip_uint8( q0 - i_delta );    /* q0' */
6827                 tprintf("filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6828             }
6829         }else{
6830             const int p0 = pix[-1];
6831             const int p1 = pix[-2];
6832             const int q0 = pix[0];
6833             const int q1 = pix[1];
6834
6835             if( FFABS( p0 - q0 ) < alpha &&
6836                 FFABS( p1 - p0 ) < beta &&
6837                 FFABS( q1 - q0 ) < beta ) {
6838
6839                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6840                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6841                 tprintf("filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6842             }
6843         }
6844     }
6845 }
6846
6847 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6848     int i, d;
6849     const int index_a = qp + h->slice_alpha_c0_offset;
6850     const int alpha = (alpha_table+52)[index_a];
6851     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6852     const int pix_next  = stride;
6853
6854     if( bS[0] < 4 ) {
6855         int8_t tc[4];
6856         for(i=0; i<4; i++)
6857             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6858         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6859     } else {
6860         /* 16px edge length, see filter_mb_edgev */
6861             for( d = 0; d < 16; d++ ) {
6862                 const int p0 = pix[-1*pix_next];
6863                 const int p1 = pix[-2*pix_next];
6864                 const int p2 = pix[-3*pix_next];
6865                 const int q0 = pix[0];
6866                 const int q1 = pix[1*pix_next];
6867                 const int q2 = pix[2*pix_next];
6868
6869                 if( FFABS( p0 - q0 ) < alpha &&
6870                     FFABS( p1 - p0 ) < beta &&
6871                     FFABS( q1 - q0 ) < beta ) {
6872
6873                     const int p3 = pix[-4*pix_next];
6874                     const int q3 = pix[ 3*pix_next];
6875
6876                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6877                         if( FFABS( p2 - p0 ) < beta) {
6878                             /* p0', p1', p2' */
6879                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6880                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6881                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6882                         } else {
6883                             /* p0' */
6884                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6885                         }
6886                         if( FFABS( q2 - q0 ) < beta) {
6887                             /* q0', q1', q2' */
6888                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6889                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6890                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6891                         } else {
6892                             /* q0' */
6893                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6894                         }
6895                     }else{
6896                         /* p0', q0' */
6897                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6898                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6899                     }
6900                     tprintf("filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6901                 }
6902                 pix++;
6903             }
6904     }
6905 }
6906
6907 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6908     int i;
6909     const int index_a = qp + h->slice_alpha_c0_offset;
6910     const int alpha = (alpha_table+52)[index_a];
6911     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6912
6913     if( bS[0] < 4 ) {
6914         int8_t tc[4];
6915         for(i=0; i<4; i++)
6916             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6917         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6918     } else {
6919         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6920     }
6921 }
6922
6923 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6924     MpegEncContext * const s = &h->s;
6925     int mb_xy, mb_type;
6926     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6927
6928     if(mb_x==0 || mb_y==0 || !s->dsp.h264_loop_filter_strength) {
6929         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6930         return;
6931     }
6932     assert(!FRAME_MBAFF);
6933
6934     mb_xy = mb_x + mb_y*s->mb_stride;
6935     mb_type = s->current_picture.mb_type[mb_xy];
6936     qp = s->current_picture.qscale_table[mb_xy];
6937     qp0 = s->current_picture.qscale_table[mb_xy-1];
6938     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6939     qpc = get_chroma_qp( h->pps.chroma_qp_index_offset, qp );
6940     qpc0 = get_chroma_qp( h->pps.chroma_qp_index_offset, qp0 );
6941     qpc1 = get_chroma_qp( h->pps.chroma_qp_index_offset, qp1 );
6942     qp0 = (qp + qp0 + 1) >> 1;
6943     qp1 = (qp + qp1 + 1) >> 1;
6944     qpc0 = (qpc + qpc0 + 1) >> 1;
6945     qpc1 = (qpc + qpc1 + 1) >> 1;
6946     qp_thresh = 15 - h->slice_alpha_c0_offset;
6947     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6948        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6949         return;
6950
6951     if( IS_INTRA(mb_type) ) {
6952         int16_t bS4[4] = {4,4,4,4};
6953         int16_t bS3[4] = {3,3,3,3};
6954         if( IS_8x8DCT(mb_type) ) {
6955             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6956             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6957             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bS4, qp1 );
6958             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6959         } else {
6960             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6961             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6962             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6963             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6964             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bS4, qp1 );
6965             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6966             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6967             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6968         }
6969         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6970         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6971         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6972         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6973         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bS4, qpc1 );
6974         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6975         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bS4, qpc1 );
6976         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6977         return;
6978     } else {
6979         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6980         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6981         int edges;
6982         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6983             edges = 4;
6984             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6985         } else {
6986             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6987                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6988             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6989                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6990                              ? 3 : 0;
6991             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6992             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6993             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6994                                               (h->slice_type == B_TYPE), edges, step, mask_edge0, mask_edge1 );
6995         }
6996         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6997             bSv[0][0] = 0x0004000400040004ULL;
6998         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6999             bSv[1][0] = 0x0004000400040004ULL;
7000
7001 #define FILTER(hv,dir,edge)\
7002         if(bSv[dir][edge]) {\
7003             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
7004             if(!(edge&1)) {\
7005                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
7006                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
7007             }\
7008         }
7009         if( edges == 1 ) {
7010             FILTER(v,0,0);
7011             FILTER(h,1,0);
7012         } else if( IS_8x8DCT(mb_type) ) {
7013             FILTER(v,0,0);
7014             FILTER(v,0,2);
7015             FILTER(h,1,0);
7016             FILTER(h,1,2);
7017         } else {
7018             FILTER(v,0,0);
7019             FILTER(v,0,1);
7020             FILTER(v,0,2);
7021             FILTER(v,0,3);
7022             FILTER(h,1,0);
7023             FILTER(h,1,1);
7024             FILTER(h,1,2);
7025             FILTER(h,1,3);
7026         }
7027 #undef FILTER
7028     }
7029 }
7030
7031 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
7032     MpegEncContext * const s = &h->s;
7033     const int mb_xy= mb_x + mb_y*s->mb_stride;
7034     const int mb_type = s->current_picture.mb_type[mb_xy];
7035     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
7036     int first_vertical_edge_done = 0;
7037     int dir;
7038     /* FIXME: A given frame may occupy more than one position in
7039      * the reference list. So ref2frm should be populated with
7040      * frame numbers, not indices. */
7041     static const int ref2frm[34] = {-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
7042                                     16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
7043
7044     //for sufficiently low qp, filtering wouldn't do anything
7045     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
7046     if(!FRAME_MBAFF){
7047         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX(0, h->pps.chroma_qp_index_offset);
7048         int qp = s->current_picture.qscale_table[mb_xy];
7049         if(qp <= qp_thresh
7050            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
7051            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
7052             return;
7053         }
7054     }
7055
7056     if (FRAME_MBAFF
7057             // left mb is in picture
7058             && h->slice_table[mb_xy-1] != 255
7059             // and current and left pair do not have the same interlaced type
7060             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
7061             // and left mb is in the same slice if deblocking_filter == 2
7062             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
7063         /* First vertical edge is different in MBAFF frames
7064          * There are 8 different bS to compute and 2 different Qp
7065          */
7066         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
7067         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
7068         int16_t bS[8];
7069         int qp[2];
7070         int chroma_qp[2];
7071         int mb_qp, mbn0_qp, mbn1_qp;
7072         int i;
7073         first_vertical_edge_done = 1;
7074
7075         if( IS_INTRA(mb_type) )
7076             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
7077         else {
7078             for( i = 0; i < 8; i++ ) {
7079                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
7080
7081                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
7082                     bS[i] = 4;
7083                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
7084                          /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
7085                          h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
7086                     bS[i] = 2;
7087                 else
7088                     bS[i] = 1;
7089             }
7090         }
7091
7092         mb_qp = s->current_picture.qscale_table[mb_xy];
7093         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
7094         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
7095         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
7096         chroma_qp[0] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, mb_qp ) +
7097                          get_chroma_qp( h->pps.chroma_qp_index_offset, mbn0_qp ) + 1 ) >> 1;
7098         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
7099         chroma_qp[1] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, mb_qp ) +
7100                          get_chroma_qp( h->pps.chroma_qp_index_offset, mbn1_qp ) + 1 ) >> 1;
7101
7102         /* Filter edge */
7103         tprintf("filter mb:%d/%d MBAFF, QPy:%d/%d, QPc:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], chroma_qp[0], chroma_qp[1], linesize, uvlinesize);
7104         { int i; for (i = 0; i < 8; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
7105         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
7106         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, chroma_qp );
7107         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, chroma_qp );
7108     }
7109     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
7110     for( dir = 0; dir < 2; dir++ )
7111     {
7112         int edge;
7113         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
7114         const int mbm_type = s->current_picture.mb_type[mbm_xy];
7115         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
7116
7117         const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
7118                                   == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
7119         // how often to recheck mv-based bS when iterating between edges
7120         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
7121                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
7122         // how often to recheck mv-based bS when iterating along each edge
7123         const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
7124
7125         if (first_vertical_edge_done) {
7126             start = 1;
7127             first_vertical_edge_done = 0;
7128         }
7129
7130         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
7131             start = 1;
7132
7133         if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
7134             && !IS_INTERLACED(mb_type)
7135             && IS_INTERLACED(mbm_type)
7136             ) {
7137             // This is a special case in the norm where the filtering must
7138             // be done twice (one each of the field) even if we are in a
7139             // frame macroblock.
7140             //
7141             static const int nnz_idx[4] = {4,5,6,3};
7142             unsigned int tmp_linesize   = 2 *   linesize;
7143             unsigned int tmp_uvlinesize = 2 * uvlinesize;
7144             int mbn_xy = mb_xy - 2 * s->mb_stride;
7145             int qp, chroma_qp;
7146             int i, j;
7147             int16_t bS[4];
7148
7149             for(j=0; j<2; j++, mbn_xy += s->mb_stride){
7150                 if( IS_INTRA(mb_type) ||
7151                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
7152                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
7153                 } else {
7154                     const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
7155                     for( i = 0; i < 4; i++ ) {
7156                         if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
7157                             mbn_nnz[nnz_idx[i]] != 0 )
7158                             bS[i] = 2;
7159                         else
7160                             bS[i] = 1;
7161                     }
7162                 }
7163                 // Do not use s->qscale as luma quantizer because it has not the same
7164                 // value in IPCM macroblocks.
7165                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
7166                 tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
7167                 { int i; for (i = 0; i < 4; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
7168                 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
7169                 chroma_qp = ( h->chroma_qp +
7170                               get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
7171                 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp );
7172                 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp );
7173             }
7174
7175             start = 1;
7176         }
7177
7178         /* Calculate bS */
7179         for( edge = start; edge < edges; edge++ ) {
7180             /* mbn_xy: neighbor macroblock */
7181             const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
7182             const int mbn_type = s->current_picture.mb_type[mbn_xy];
7183             int16_t bS[4];
7184             int qp;
7185
7186             if( (edge&1) && IS_8x8DCT(mb_type) )
7187                 continue;
7188
7189             if( IS_INTRA(mb_type) ||
7190                 IS_INTRA(mbn_type) ) {
7191                 int value;
7192                 if (edge == 0) {
7193                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
7194                         || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
7195                     ) {
7196                         value = 4;
7197                     } else {
7198                         value = 3;
7199                     }
7200                 } else {
7201                     value = 3;
7202                 }
7203                 bS[0] = bS[1] = bS[2] = bS[3] = value;
7204             } else {
7205                 int i, l;
7206                 int mv_done;
7207
7208                 if( edge & mask_edge ) {
7209                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
7210                     mv_done = 1;
7211                 }
7212                 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
7213                     bS[0] = bS[1] = bS[2] = bS[3] = 1;
7214                     mv_done = 1;
7215                 }
7216                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
7217                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
7218                     int bn_idx= b_idx - (dir ? 8:1);
7219                     int v = 0;
7220                     for( l = 0; !v && l < 1 + (h->slice_type == B_TYPE); l++ ) {
7221                         v |= ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
7222                              FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
7223                              FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
7224                     }
7225                     bS[0] = bS[1] = bS[2] = bS[3] = v;
7226                     mv_done = 1;
7227                 }
7228                 else
7229                     mv_done = 0;
7230
7231                 for( i = 0; i < 4; i++ ) {
7232                     int x = dir == 0 ? edge : i;
7233                     int y = dir == 0 ? i    : edge;
7234                     int b_idx= 8 + 4 + x + 8*y;
7235                     int bn_idx= b_idx - (dir ? 8:1);
7236
7237                     if( h->non_zero_count_cache[b_idx] != 0 ||
7238                         h->non_zero_count_cache[bn_idx] != 0 ) {
7239                         bS[i] = 2;
7240                     }
7241                     else if(!mv_done)
7242                     {
7243                         bS[i] = 0;
7244                         for( l = 0; l < 1 + (h->slice_type == B_TYPE); l++ ) {
7245                             if( ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
7246                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
7247                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
7248                                 bS[i] = 1;
7249                                 break;
7250                             }
7251                         }
7252                     }
7253                 }
7254
7255                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
7256                     continue;
7257             }
7258
7259             /* Filter edge */
7260             // Do not use s->qscale as luma quantizer because it has not the same
7261             // value in IPCM macroblocks.
7262             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
7263             //tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
7264             tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
7265             { int i; for (i = 0; i < 4; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
7266             if( dir == 0 ) {
7267                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
7268                 if( (edge&1) == 0 ) {
7269                     int chroma_qp = ( h->chroma_qp +
7270                                       get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
7271                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS, chroma_qp );
7272                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS, chroma_qp );
7273                 }
7274             } else {
7275                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
7276                 if( (edge&1) == 0 ) {
7277                     int chroma_qp = ( h->chroma_qp +
7278                                       get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
7279                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
7280                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
7281                 }
7282             }
7283         }
7284     }
7285 }
7286
7287 static int decode_slice(H264Context *h){
7288     MpegEncContext * const s = &h->s;
7289     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
7290
7291     s->mb_skip_run= -1;
7292
7293     if( h->pps.cabac ) {
7294         int i;
7295
7296         /* realign */
7297         align_get_bits( &s->gb );
7298
7299         /* init cabac */
7300         ff_init_cabac_states( &h->cabac);
7301         ff_init_cabac_decoder( &h->cabac,
7302                                s->gb.buffer + get_bits_count(&s->gb)/8,
7303                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
7304         /* calculate pre-state */
7305         for( i= 0; i < 460; i++ ) {
7306             int pre;
7307             if( h->slice_type == I_TYPE )
7308                 pre = clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
7309             else
7310                 pre = clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
7311
7312             if( pre <= 63 )
7313                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
7314             else
7315                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
7316         }
7317
7318         for(;;){
7319 //START_TIMER
7320             int ret = decode_mb_cabac(h);
7321             int eos;
7322 //STOP_TIMER("decode_mb_cabac")
7323
7324             if(ret>=0) hl_decode_mb(h);
7325
7326             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
7327                 s->mb_y++;
7328
7329                 if(ret>=0) ret = decode_mb_cabac(h);
7330
7331                 if(ret>=0) hl_decode_mb(h);
7332                 s->mb_y--;
7333             }
7334             eos = get_cabac_terminate( &h->cabac );
7335
7336             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
7337                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%d)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
7338                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7339                 return -1;
7340             }
7341
7342             if( ++s->mb_x >= s->mb_width ) {
7343                 s->mb_x = 0;
7344                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
7345                 ++s->mb_y;
7346                 if(FRAME_MBAFF) {
7347                     ++s->mb_y;
7348                 }
7349             }
7350
7351             if( eos || s->mb_y >= s->mb_height ) {
7352                 tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7353                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7354                 return 0;
7355             }
7356         }
7357
7358     } else {
7359         for(;;){
7360             int ret = decode_mb_cavlc(h);
7361
7362             if(ret>=0) hl_decode_mb(h);
7363
7364             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
7365                 s->mb_y++;
7366                 ret = decode_mb_cavlc(h);
7367
7368                 if(ret>=0) hl_decode_mb(h);
7369                 s->mb_y--;
7370             }
7371
7372             if(ret<0){
7373                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
7374                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7375
7376                 return -1;
7377             }
7378
7379             if(++s->mb_x >= s->mb_width){
7380                 s->mb_x=0;
7381                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
7382                 ++s->mb_y;
7383                 if(FRAME_MBAFF) {
7384                     ++s->mb_y;
7385                 }
7386                 if(s->mb_y >= s->mb_height){
7387                     tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7388
7389                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
7390                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7391
7392                         return 0;
7393                     }else{
7394                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7395
7396                         return -1;
7397                     }
7398                 }
7399             }
7400
7401             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
7402                 tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7403                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
7404                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7405
7406                     return 0;
7407                 }else{
7408                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7409
7410                     return -1;
7411                 }
7412             }
7413         }
7414     }
7415
7416 #if 0
7417     for(;s->mb_y < s->mb_height; s->mb_y++){
7418         for(;s->mb_x < s->mb_width; s->mb_x++){
7419             int ret= decode_mb(h);
7420
7421             hl_decode_mb(h);
7422
7423             if(ret<0){
7424                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
7425                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7426
7427                 return -1;
7428             }
7429
7430             if(++s->mb_x >= s->mb_width){
7431                 s->mb_x=0;
7432                 if(++s->mb_y >= s->mb_height){
7433                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
7434                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7435
7436                         return 0;
7437                     }else{
7438                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7439
7440                         return -1;
7441                     }
7442                 }
7443             }
7444
7445             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
7446                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
7447                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7448
7449                     return 0;
7450                 }else{
7451                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7452
7453                     return -1;
7454                 }
7455             }
7456         }
7457         s->mb_x=0;
7458         ff_draw_horiz_band(s, 16*s->mb_y, 16);
7459     }
7460 #endif
7461     return -1; //not reached
7462 }
7463
7464 static int decode_unregistered_user_data(H264Context *h, int size){
7465     MpegEncContext * const s = &h->s;
7466     uint8_t user_data[16+256];
7467     int e, build, i;
7468
7469     if(size<16)
7470         return -1;
7471
7472     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
7473         user_data[i]= get_bits(&s->gb, 8);
7474     }
7475
7476     user_data[i]= 0;
7477     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
7478     if(e==1 && build>=0)
7479         h->x264_build= build;
7480
7481     if(s->avctx->debug & FF_DEBUG_BUGS)
7482         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
7483
7484     for(; i<size; i++)
7485         skip_bits(&s->gb, 8);
7486
7487     return 0;
7488 }
7489
7490 static int decode_sei(H264Context *h){
7491     MpegEncContext * const s = &h->s;
7492
7493     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
7494         int size, type;
7495
7496         type=0;
7497         do{
7498             type+= show_bits(&s->gb, 8);
7499         }while(get_bits(&s->gb, 8) == 255);
7500
7501         size=0;
7502         do{
7503             size+= show_bits(&s->gb, 8);
7504         }while(get_bits(&s->gb, 8) == 255);
7505
7506         switch(type){
7507         case 5:
7508             if(decode_unregistered_user_data(h, size) < 0)
7509                 return -1;
7510             break;
7511         default:
7512             skip_bits(&s->gb, 8*size);
7513         }
7514
7515         //FIXME check bits here
7516         align_get_bits(&s->gb);
7517     }
7518
7519     return 0;
7520 }
7521
7522 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
7523     MpegEncContext * const s = &h->s;
7524     int cpb_count, i;
7525     cpb_count = get_ue_golomb(&s->gb) + 1;
7526     get_bits(&s->gb, 4); /* bit_rate_scale */
7527     get_bits(&s->gb, 4); /* cpb_size_scale */
7528     for(i=0; i<cpb_count; i++){
7529         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
7530         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
7531         get_bits1(&s->gb);     /* cbr_flag */
7532     }
7533     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
7534     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
7535     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
7536     get_bits(&s->gb, 5); /* time_offset_length */
7537 }
7538
7539 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
7540     MpegEncContext * const s = &h->s;
7541     int aspect_ratio_info_present_flag, aspect_ratio_idc;
7542     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
7543
7544     aspect_ratio_info_present_flag= get_bits1(&s->gb);
7545
7546     if( aspect_ratio_info_present_flag ) {
7547         aspect_ratio_idc= get_bits(&s->gb, 8);
7548         if( aspect_ratio_idc == EXTENDED_SAR ) {
7549             sps->sar.num= get_bits(&s->gb, 16);
7550             sps->sar.den= get_bits(&s->gb, 16);
7551         }else if(aspect_ratio_idc < 14){
7552             sps->sar=  pixel_aspect[aspect_ratio_idc];
7553         }else{
7554             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
7555             return -1;
7556         }
7557     }else{
7558         sps->sar.num=
7559         sps->sar.den= 0;
7560     }
7561 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
7562
7563     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
7564         get_bits1(&s->gb);      /* overscan_appropriate_flag */
7565     }
7566
7567     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
7568         get_bits(&s->gb, 3);    /* video_format */
7569         get_bits1(&s->gb);      /* video_full_range_flag */
7570         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
7571             get_bits(&s->gb, 8); /* colour_primaries */
7572             get_bits(&s->gb, 8); /* transfer_characteristics */
7573             get_bits(&s->gb, 8); /* matrix_coefficients */
7574         }
7575     }
7576
7577     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
7578         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
7579         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
7580     }
7581
7582     sps->timing_info_present_flag = get_bits1(&s->gb);
7583     if(sps->timing_info_present_flag){
7584         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7585         sps->time_scale = get_bits_long(&s->gb, 32);
7586         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7587     }
7588
7589     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7590     if(nal_hrd_parameters_present_flag)
7591         decode_hrd_parameters(h, sps);
7592     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7593     if(vcl_hrd_parameters_present_flag)
7594         decode_hrd_parameters(h, sps);
7595     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
7596         get_bits1(&s->gb);     /* low_delay_hrd_flag */
7597     get_bits1(&s->gb);         /* pic_struct_present_flag */
7598
7599     sps->bitstream_restriction_flag = get_bits1(&s->gb);
7600     if(sps->bitstream_restriction_flag){
7601         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
7602         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7603         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7604         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7605         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7606         sps->num_reorder_frames = get_ue_golomb(&s->gb);
7607         get_ue_golomb(&s->gb); /* max_dec_frame_buffering */
7608     }
7609
7610     return 0;
7611 }
7612
7613 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7614                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7615     MpegEncContext * const s = &h->s;
7616     int i, last = 8, next = 8;
7617     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
7618     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7619         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7620     else
7621     for(i=0;i<size;i++){
7622         if(next)
7623             next = (last + get_se_golomb(&s->gb)) & 0xff;
7624         if(!i && !next){ /* matrix not written, we use the preset one */
7625             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7626             break;
7627         }
7628         last = factors[scan[i]] = next ? next : last;
7629     }
7630 }
7631
7632 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7633                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7634     MpegEncContext * const s = &h->s;
7635     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7636     const uint8_t *fallback[4] = {
7637         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7638         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7639         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7640         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7641     };
7642     if(get_bits1(&s->gb)){
7643         sps->scaling_matrix_present |= is_sps;
7644         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7645         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7646         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7647         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7648         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7649         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7650         if(is_sps || pps->transform_8x8_mode){
7651             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7652             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7653         }
7654     } else if(fallback_sps) {
7655         memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
7656         memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
7657     }
7658 }
7659
7660 static inline int decode_seq_parameter_set(H264Context *h){
7661     MpegEncContext * const s = &h->s;
7662     int profile_idc, level_idc;
7663     int sps_id, i;
7664     SPS *sps;
7665
7666     profile_idc= get_bits(&s->gb, 8);
7667     get_bits1(&s->gb);   //constraint_set0_flag
7668     get_bits1(&s->gb);   //constraint_set1_flag
7669     get_bits1(&s->gb);   //constraint_set2_flag
7670     get_bits1(&s->gb);   //constraint_set3_flag
7671     get_bits(&s->gb, 4); // reserved
7672     level_idc= get_bits(&s->gb, 8);
7673     sps_id= get_ue_golomb(&s->gb);
7674
7675     sps= &h->sps_buffer[ sps_id ];
7676     sps->profile_idc= profile_idc;
7677     sps->level_idc= level_idc;
7678
7679     if(sps->profile_idc >= 100){ //high profile
7680         if(get_ue_golomb(&s->gb) == 3) //chroma_format_idc
7681             get_bits1(&s->gb);  //residual_color_transform_flag
7682         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7683         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7684         sps->transform_bypass = get_bits1(&s->gb);
7685         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7686     }else
7687         sps->scaling_matrix_present = 0;
7688
7689     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7690     sps->poc_type= get_ue_golomb(&s->gb);
7691
7692     if(sps->poc_type == 0){ //FIXME #define
7693         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7694     } else if(sps->poc_type == 1){//FIXME #define
7695         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7696         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7697         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7698         sps->poc_cycle_length= get_ue_golomb(&s->gb);
7699
7700         for(i=0; i<sps->poc_cycle_length; i++)
7701             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7702     }
7703     if(sps->poc_type > 2){
7704         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7705         return -1;
7706     }
7707
7708     sps->ref_frame_count= get_ue_golomb(&s->gb);
7709     if(sps->ref_frame_count > MAX_PICTURE_COUNT-2){
7710         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7711     }
7712     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7713     sps->mb_width= get_ue_golomb(&s->gb) + 1;
7714     sps->mb_height= get_ue_golomb(&s->gb) + 1;
7715     if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7716        avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height))
7717         return -1;
7718
7719     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7720     if(!sps->frame_mbs_only_flag)
7721         sps->mb_aff= get_bits1(&s->gb);
7722     else
7723         sps->mb_aff= 0;
7724
7725     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7726
7727 #ifndef ALLOW_INTERLACE
7728     if(sps->mb_aff)
7729         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7730 #endif
7731     if(!sps->direct_8x8_inference_flag && sps->mb_aff)
7732         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
7733
7734     sps->crop= get_bits1(&s->gb);
7735     if(sps->crop){
7736         sps->crop_left  = get_ue_golomb(&s->gb);
7737         sps->crop_right = get_ue_golomb(&s->gb);
7738         sps->crop_top   = get_ue_golomb(&s->gb);
7739         sps->crop_bottom= get_ue_golomb(&s->gb);
7740         if(sps->crop_left || sps->crop_top){
7741             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7742         }
7743     }else{
7744         sps->crop_left  =
7745         sps->crop_right =
7746         sps->crop_top   =
7747         sps->crop_bottom= 0;
7748     }
7749
7750     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7751     if( sps->vui_parameters_present_flag )
7752         decode_vui_parameters(h, sps);
7753
7754     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7755         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%d profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
7756                sps_id, sps->profile_idc, sps->level_idc,
7757                sps->poc_type,
7758                sps->ref_frame_count,
7759                sps->mb_width, sps->mb_height,
7760                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7761                sps->direct_8x8_inference_flag ? "8B8" : "",
7762                sps->crop_left, sps->crop_right,
7763                sps->crop_top, sps->crop_bottom,
7764                sps->vui_parameters_present_flag ? "VUI" : ""
7765                );
7766     }
7767     return 0;
7768 }
7769
7770 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7771     MpegEncContext * const s = &h->s;
7772     int pps_id= get_ue_golomb(&s->gb);
7773     PPS *pps= &h->pps_buffer[pps_id];
7774
7775     pps->sps_id= get_ue_golomb(&s->gb);
7776     pps->cabac= get_bits1(&s->gb);
7777     pps->pic_order_present= get_bits1(&s->gb);
7778     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7779     if(pps->slice_group_count > 1 ){
7780         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7781         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7782         switch(pps->mb_slice_group_map_type){
7783         case 0:
7784 #if 0
7785 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7786 |    run_length[ i ]                                |1  |ue(v)   |
7787 #endif
7788             break;
7789         case 2:
7790 #if 0
7791 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7792 |{                                                  |   |        |
7793 |    top_left_mb[ i ]                               |1  |ue(v)   |
7794 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7795 |   }                                               |   |        |
7796 #endif
7797             break;
7798         case 3:
7799         case 4:
7800         case 5:
7801 #if 0
7802 |   slice_group_change_direction_flag               |1  |u(1)    |
7803 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7804 #endif
7805             break;
7806         case 6:
7807 #if 0
7808 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7809 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7810 |)                                                  |   |        |
7811 |    slice_group_id[ i ]                            |1  |u(v)    |
7812 #endif
7813             break;
7814         }
7815     }
7816     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7817     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7818     if(pps->ref_count[0] > 32 || pps->ref_count[1] > 32){
7819         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7820         return -1;
7821     }
7822
7823     pps->weighted_pred= get_bits1(&s->gb);
7824     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7825     pps->init_qp= get_se_golomb(&s->gb) + 26;
7826     pps->init_qs= get_se_golomb(&s->gb) + 26;
7827     pps->chroma_qp_index_offset= get_se_golomb(&s->gb);
7828     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7829     pps->constrained_intra_pred= get_bits1(&s->gb);
7830     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7831
7832     pps->transform_8x8_mode= 0;
7833     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7834     memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
7835     memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
7836
7837     if(get_bits_count(&s->gb) < bit_length){
7838         pps->transform_8x8_mode= get_bits1(&s->gb);
7839         decode_scaling_matrices(h, &h->sps_buffer[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7840         get_se_golomb(&s->gb);  //second_chroma_qp_index_offset
7841     }
7842
7843     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7844         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%d sps:%d %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d %s %s %s %s\n",
7845                pps_id, pps->sps_id,
7846                pps->cabac ? "CABAC" : "CAVLC",
7847                pps->slice_group_count,
7848                pps->ref_count[0], pps->ref_count[1],
7849                pps->weighted_pred ? "weighted" : "",
7850                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset,
7851                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7852                pps->constrained_intra_pred ? "CONSTR" : "",
7853                pps->redundant_pic_cnt_present ? "REDU" : "",
7854                pps->transform_8x8_mode ? "8x8DCT" : ""
7855                );
7856     }
7857
7858     return 0;
7859 }
7860
7861 /**
7862  * finds the end of the current frame in the bitstream.
7863  * @return the position of the first byte of the next frame, or -1
7864  */
7865 static int find_frame_end(H264Context *h, const uint8_t *buf, int buf_size){
7866     int i;
7867     uint32_t state;
7868     ParseContext *pc = &(h->s.parse_context);
7869 //printf("first %02X%02X%02X%02X\n", buf[0], buf[1],buf[2],buf[3]);
7870 //    mb_addr= pc->mb_addr - 1;
7871     state= pc->state;
7872     for(i=0; i<=buf_size; i++){
7873         if((state&0xFFFFFF1F) == 0x101 || (state&0xFFFFFF1F) == 0x102 || (state&0xFFFFFF1F) == 0x105){
7874             tprintf("find_frame_end new startcode = %08x, frame_start_found = %d, pos = %d\n", state, pc->frame_start_found, i);
7875             if(pc->frame_start_found){
7876                 // If there isn't one more byte in the buffer
7877                 // the test on first_mb_in_slice cannot be done yet
7878                 // do it at next call.
7879                 if (i >= buf_size) break;
7880                 if (buf[i] & 0x80) {
7881                     // first_mb_in_slice is 0, probably the first nal of a new
7882                     // slice
7883                     tprintf("find_frame_end frame_end_found, state = %08x, pos = %d\n", state, i);
7884                     pc->state=-1;
7885                     pc->frame_start_found= 0;
7886                     return i-4;
7887                 }
7888             }
7889             pc->frame_start_found = 1;
7890         }
7891         if((state&0xFFFFFF1F) == 0x107 || (state&0xFFFFFF1F) == 0x108 || (state&0xFFFFFF1F) == 0x109){
7892            if(pc->frame_start_found){
7893                 pc->state=-1;
7894                 pc->frame_start_found= 0;
7895                 return i-4;
7896            }
7897         }
7898         if (i<buf_size)
7899             state= (state<<8) | buf[i];
7900     }
7901
7902     pc->state= state;
7903     return END_NOT_FOUND;
7904 }
7905
7906 #ifdef CONFIG_H264_PARSER
7907 static int h264_parse(AVCodecParserContext *s,
7908                       AVCodecContext *avctx,
7909                       uint8_t **poutbuf, int *poutbuf_size,
7910                       const uint8_t *buf, int buf_size)
7911 {
7912     H264Context *h = s->priv_data;
7913     ParseContext *pc = &h->s.parse_context;
7914     int next;
7915
7916     next= find_frame_end(h, buf, buf_size);
7917
7918     if (ff_combine_frame(pc, next, (uint8_t **)&buf, &buf_size) < 0) {
7919         *poutbuf = NULL;
7920         *poutbuf_size = 0;
7921         return buf_size;
7922     }
7923
7924     *poutbuf = (uint8_t *)buf;
7925     *poutbuf_size = buf_size;
7926     return next;
7927 }
7928
7929 static int h264_split(AVCodecContext *avctx,
7930                       const uint8_t *buf, int buf_size)
7931 {
7932     int i;
7933     uint32_t state = -1;
7934     int has_sps= 0;
7935
7936     for(i=0; i<=buf_size; i++){
7937         if((state&0xFFFFFF1F) == 0x107)
7938             has_sps=1;
7939 /*        if((state&0xFFFFFF1F) == 0x101 || (state&0xFFFFFF1F) == 0x102 || (state&0xFFFFFF1F) == 0x105){
7940         }*/
7941         if((state&0xFFFFFF00) == 0x100 && (state&0xFFFFFF1F) != 0x107 && (state&0xFFFFFF1F) != 0x108 && (state&0xFFFFFF1F) != 0x109){
7942             if(has_sps){
7943                 while(i>4 && buf[i-5]==0) i--;
7944                 return i-4;
7945             }
7946         }
7947         if (i<buf_size)
7948             state= (state<<8) | buf[i];
7949     }
7950     return 0;
7951 }
7952 #endif /* CONFIG_H264_PARSER */
7953
7954 static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
7955     MpegEncContext * const s = &h->s;
7956     AVCodecContext * const avctx= s->avctx;
7957     int buf_index=0;
7958 #if 0
7959     int i;
7960     for(i=0; i<50; i++){
7961         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7962     }
7963 #endif
7964     h->slice_num = 0;
7965     s->current_picture_ptr= NULL;
7966     for(;;){
7967         int consumed;
7968         int dst_length;
7969         int bit_length;
7970         uint8_t *ptr;
7971         int i, nalsize = 0;
7972
7973       if(h->is_avc) {
7974         if(buf_index >= buf_size) break;
7975         nalsize = 0;
7976         for(i = 0; i < h->nal_length_size; i++)
7977             nalsize = (nalsize << 8) | buf[buf_index++];
7978         if(nalsize <= 1){
7979             if(nalsize == 1){
7980                 buf_index++;
7981                 continue;
7982             }else{
7983                 av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7984                 break;
7985             }
7986         }
7987       } else {
7988         // start code prefix search
7989         for(; buf_index + 3 < buf_size; buf_index++){
7990             // this should allways succeed in the first iteration
7991             if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7992                 break;
7993         }
7994
7995         if(buf_index+3 >= buf_size) break;
7996
7997         buf_index+=3;
7998       }
7999
8000         ptr= decode_nal(h, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
8001         while(ptr[dst_length - 1] == 0 && dst_length > 1)
8002             dst_length--;
8003         bit_length= 8*dst_length - decode_rbsp_trailing(ptr + dst_length - 1);
8004
8005         if(s->avctx->debug&FF_DEBUG_STARTCODE){
8006             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", h->nal_unit_type, buf_index, buf_size, dst_length);
8007         }
8008
8009         if (h->is_avc && (nalsize != consumed))
8010             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
8011
8012         buf_index += consumed;
8013
8014         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME dont discard SEI id
8015            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
8016             continue;
8017
8018         switch(h->nal_unit_type){
8019         case NAL_IDR_SLICE:
8020             idr(h); //FIXME ensure we don't loose some frames if there is reordering
8021         case NAL_SLICE:
8022             init_get_bits(&s->gb, ptr, bit_length);
8023             h->intra_gb_ptr=
8024             h->inter_gb_ptr= &s->gb;
8025             s->data_partitioning = 0;
8026
8027             if(decode_slice_header(h) < 0){
8028                 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
8029                 break;
8030             }
8031             s->current_picture_ptr->key_frame= (h->nal_unit_type == NAL_IDR_SLICE);
8032             if(h->redundant_pic_count==0 && s->hurry_up < 5
8033                && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc)
8034                && (avctx->skip_frame < AVDISCARD_BIDIR  || h->slice_type!=B_TYPE)
8035                && (avctx->skip_frame < AVDISCARD_NONKEY || h->slice_type==I_TYPE)
8036                && avctx->skip_frame < AVDISCARD_ALL)
8037                 decode_slice(h);
8038             break;
8039         case NAL_DPA:
8040             init_get_bits(&s->gb, ptr, bit_length);
8041             h->intra_gb_ptr=
8042             h->inter_gb_ptr= NULL;
8043             s->data_partitioning = 1;
8044
8045             if(decode_slice_header(h) < 0){
8046                 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
8047             }
8048             break;
8049         case NAL_DPB:
8050             init_get_bits(&h->intra_gb, ptr, bit_length);
8051             h->intra_gb_ptr= &h->intra_gb;
8052             break;
8053         case NAL_DPC:
8054             init_get_bits(&h->inter_gb, ptr, bit_length);
8055             h->inter_gb_ptr= &h->inter_gb;
8056
8057             if(h->redundant_pic_count==0 && h->intra_gb_ptr && s->data_partitioning
8058                && s->hurry_up < 5
8059                && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc)
8060                && (avctx->skip_frame < AVDISCARD_BIDIR  || h->slice_type!=B_TYPE)
8061                && (avctx->skip_frame < AVDISCARD_NONKEY || h->slice_type==I_TYPE)
8062                && avctx->skip_frame < AVDISCARD_ALL)
8063                 decode_slice(h);
8064             break;
8065         case NAL_SEI:
8066             init_get_bits(&s->gb, ptr, bit_length);
8067             decode_sei(h);
8068             break;
8069         case NAL_SPS:
8070             init_get_bits(&s->gb, ptr, bit_length);
8071             decode_seq_parameter_set(h);
8072
8073             if(s->flags& CODEC_FLAG_LOW_DELAY)
8074                 s->low_delay=1;
8075
8076             if(avctx->has_b_frames < 2)
8077                 avctx->has_b_frames= !s->low_delay;
8078             break;
8079         case NAL_PPS:
8080             init_get_bits(&s->gb, ptr, bit_length);
8081
8082             decode_picture_parameter_set(h, bit_length);
8083
8084             break;
8085         case NAL_AUD:
8086         case NAL_END_SEQUENCE:
8087         case NAL_END_STREAM:
8088         case NAL_FILLER_DATA:
8089         case NAL_SPS_EXT:
8090         case NAL_AUXILIARY_SLICE:
8091             break;
8092         default:
8093             av_log(avctx, AV_LOG_ERROR, "Unknown NAL code: %d\n", h->nal_unit_type);
8094         }
8095     }
8096
8097     if(!s->current_picture_ptr) return buf_index; //no frame
8098
8099     s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
8100     s->current_picture_ptr->pict_type= s->pict_type;
8101
8102     h->prev_frame_num_offset= h->frame_num_offset;
8103     h->prev_frame_num= h->frame_num;
8104     if(s->current_picture_ptr->reference){
8105         h->prev_poc_msb= h->poc_msb;
8106         h->prev_poc_lsb= h->poc_lsb;
8107     }
8108     if(s->current_picture_ptr->reference)
8109         execute_ref_pic_marking(h, h->mmco, h->mmco_index);
8110
8111     ff_er_frame_end(s);
8112
8113     MPV_frame_end(s);
8114
8115     return buf_index;
8116 }
8117
8118 /**
8119  * returns the number of bytes consumed for building the current frame
8120  */
8121 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
8122     if(s->flags&CODEC_FLAG_TRUNCATED){
8123         pos -= s->parse_context.last_index;
8124         if(pos<0) pos=0; // FIXME remove (unneeded?)
8125
8126         return pos;
8127     }else{
8128         if(pos==0) pos=1; //avoid infinite loops (i doubt thats needed but ...)
8129         if(pos+10>buf_size) pos=buf_size; // oops ;)
8130
8131         return pos;
8132     }
8133 }
8134
8135 static int decode_frame(AVCodecContext *avctx,
8136                              void *data, int *data_size,
8137                              uint8_t *buf, int buf_size)
8138 {
8139     H264Context *h = avctx->priv_data;
8140     MpegEncContext *s = &h->s;
8141     AVFrame *pict = data;
8142     int buf_index;
8143
8144     s->flags= avctx->flags;
8145     s->flags2= avctx->flags2;
8146
8147    /* no supplementary picture */
8148     if (buf_size == 0) {
8149         return 0;
8150     }
8151
8152     if(s->flags&CODEC_FLAG_TRUNCATED){
8153         int next= find_frame_end(h, buf, buf_size);
8154
8155         if( ff_combine_frame(&s->parse_context, next, &buf, &buf_size) < 0 )
8156             return buf_size;
8157 //printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
8158     }
8159
8160     if(h->is_avc && !h->got_avcC) {
8161         int i, cnt, nalsize;
8162         unsigned char *p = avctx->extradata;
8163         if(avctx->extradata_size < 7) {
8164             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
8165             return -1;
8166         }
8167         if(*p != 1) {
8168             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
8169             return -1;
8170         }
8171         /* sps and pps in the avcC always have length coded with 2 bytes,
8172            so put a fake nal_length_size = 2 while parsing them */
8173         h->nal_length_size = 2;
8174         // Decode sps from avcC
8175         cnt = *(p+5) & 0x1f; // Number of sps
8176         p += 6;
8177         for (i = 0; i < cnt; i++) {
8178             nalsize = BE_16(p) + 2;
8179             if(decode_nal_units(h, p, nalsize) < 0) {
8180                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
8181                 return -1;
8182             }
8183             p += nalsize;
8184         }
8185         // Decode pps from avcC
8186         cnt = *(p++); // Number of pps
8187         for (i = 0; i < cnt; i++) {
8188             nalsize = BE_16(p) + 2;
8189             if(decode_nal_units(h, p, nalsize)  != nalsize) {
8190                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
8191                 return -1;
8192             }
8193             p += nalsize;
8194         }
8195         // Now store right nal length size, that will be use to parse all other nals
8196         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
8197         // Do not reparse avcC
8198         h->got_avcC = 1;
8199     }
8200
8201     if(!h->is_avc && s->avctx->extradata_size && s->picture_number==0){
8202         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
8203             return -1;
8204     }
8205
8206     buf_index=decode_nal_units(h, buf, buf_size);
8207     if(buf_index < 0)
8208         return -1;
8209
8210     //FIXME do something with unavailable reference frames
8211
8212 //    if(ret==FRAME_SKIPPED) return get_consumed_bytes(s, buf_index, buf_size);
8213     if(!s->current_picture_ptr){
8214         av_log(h->s.avctx, AV_LOG_DEBUG, "error, NO frame\n");
8215         return -1;
8216     }
8217
8218     {
8219         Picture *out = s->current_picture_ptr;
8220 #if 0 //decode order
8221         *data_size = sizeof(AVFrame);
8222 #else
8223         /* Sort B-frames into display order */
8224         Picture *cur = s->current_picture_ptr;
8225         Picture *prev = h->delayed_output_pic;
8226         int i, pics, cross_idr, out_of_order, out_idx;
8227
8228         if(h->sps.bitstream_restriction_flag
8229            && s->avctx->has_b_frames < h->sps.num_reorder_frames){
8230             s->avctx->has_b_frames = h->sps.num_reorder_frames;
8231             s->low_delay = 0;
8232         }
8233
8234         pics = 0;
8235         while(h->delayed_pic[pics]) pics++;
8236         h->delayed_pic[pics++] = cur;
8237         if(cur->reference == 0)
8238             cur->reference = 1;
8239
8240         cross_idr = 0;
8241         for(i=0; h->delayed_pic[i]; i++)
8242             if(h->delayed_pic[i]->key_frame || h->delayed_pic[i]->poc==0)
8243                 cross_idr = 1;
8244
8245         out = h->delayed_pic[0];
8246         out_idx = 0;
8247         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
8248             if(h->delayed_pic[i]->poc < out->poc){
8249                 out = h->delayed_pic[i];
8250                 out_idx = i;
8251             }
8252
8253         out_of_order = !cross_idr && prev && out->poc < prev->poc;
8254         if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
8255             { }
8256         else if(prev && pics <= s->avctx->has_b_frames)
8257             out = prev;
8258         else if((out_of_order && pics-1 == s->avctx->has_b_frames && pics < 15)
8259            || (s->low_delay &&
8260             ((!cross_idr && prev && out->poc > prev->poc + 2)
8261              || cur->pict_type == B_TYPE)))
8262         {
8263             s->low_delay = 0;
8264             s->avctx->has_b_frames++;
8265             out = prev;
8266         }
8267         else if(out_of_order)
8268             out = prev;
8269
8270         if(out_of_order || pics > s->avctx->has_b_frames){
8271             for(i=out_idx; h->delayed_pic[i]; i++)
8272                 h->delayed_pic[i] = h->delayed_pic[i+1];
8273         }
8274
8275         if(prev == out)
8276             *data_size = 0;
8277         else
8278             *data_size = sizeof(AVFrame);
8279         if(prev && prev != out && prev->reference == 1)
8280             prev->reference = 0;
8281         h->delayed_output_pic = out;
8282 #endif
8283
8284         if(out)
8285             *pict= *(AVFrame*)out;
8286         else
8287             av_log(avctx, AV_LOG_DEBUG, "no picture\n");
8288     }
8289
8290     assert(pict->data[0] || !*data_size);
8291     ff_print_debug_info(s, pict);
8292 //printf("out %d\n", (int)pict->data[0]);
8293 #if 0 //?
8294
8295     /* Return the Picture timestamp as the frame number */
8296     /* we substract 1 because it is added on utils.c    */
8297     avctx->frame_number = s->picture_number - 1;
8298 #endif
8299     return get_consumed_bytes(s, buf_index, buf_size);
8300 }
#if 0
/**
 * Fills h->mb_avail[0..5] for the macroblock at (s->mb_x, s->mb_y).
 * Entries 0..2 cover the three macroblocks of the row above (left, middle,
 * right) and entry 3 the macroblock to the left; each is set when that
 * position lies inside the picture and its slice_table entry equals
 * h->slice_num. Entries 4 and 5 are the constants 1 and 0.
 */
static inline void fill_mb_avail(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int cur_xy    = s->mb_x + s->mb_y*s->mb_stride;
    const int top_xy    = cur_xy - s->mb_stride;
    const int has_top   = s->mb_y != 0;
    const int has_left  = s->mb_x != 0;
    const int has_right = s->mb_x+1 < s->mb_width;

    /* slice_table is only read once the position is known to exist */
    h->mb_avail[0]= has_top && has_left  && h->slice_table[top_xy - 1] == h->slice_num;
    h->mb_avail[1]= has_top              && h->slice_table[top_xy    ] == h->slice_num;
    h->mb_avail[2]= has_top && has_right && h->slice_table[top_xy + 1] == h->slice_num;
    h->mb_avail[3]= has_left && h->slice_table[cur_xy - 1] == h->slice_num;
    h->mb_avail[4]= 1; //FIXME move out
    h->mb_avail[5]= 0; //FIXME move out
}
#endif
8320
#if 0 //selftest
#define COUNT 8000
#define SIZE (COUNT*40)
/**
 * Self test, normally compiled out.
 *
 * Writes COUNT values with the unsigned and the signed exp-golomb writer and
 * reads them back, prints the reconstruction error of a 4x4 DCT followed by
 * the IDCT, and round-trips random payloads through encode_nal() and
 * decode_nal().
 *
 * @return 0 when the run completes, -1 as soon as the NAL layer test fails;
 *         exp-golomb mismatches are only printed
 */
int main(){
    int i;
    uint8_t temp[SIZE];
    PutBitContext pb;
    GetBitContext gb;
    DSPContext dsp;
    AVCodecContext avctx;

    /* avctx is handed to dsputil_init(), so it must not be left uninitialized */
    memset(&avctx, 0, sizeof(AVCodecContext));
    dsputil_init(&dsp, &avctx);

    init_put_bits(&pb, temp, SIZE);
    printf("testing unsigned exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_ue_golomb(&pb, i);
        STOP_TIMER("set_ue_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        /* remember the upcoming bits for the mismatch report */
        s= show_bits(&gb, 24);

        START_TIMER
        j= get_ue_golomb(&gb);
        if(j != i){
            printf("missmatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
//            return -1;
        }
        STOP_TIMER("get_ue_golomb");
    }


    init_put_bits(&pb, temp, SIZE);
    printf("testing signed exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_se_golomb(&pb, i - COUNT/2);
        STOP_TIMER("set_se_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_se_golomb(&gb);
        if(j != i - COUNT/2){
            /* report the value that was actually written, not the loop index */
            printf("missmatch! at %d (%d should be %d) bits:%6X\n", i, j, i - COUNT/2, s);
//            return -1;
        }
        STOP_TIMER("get_se_golomb");
    }

    printf("testing 4x4 (I)DCT\n");

    DCTELEM block[16];
    uint8_t src[16], ref[16];
    uint64_t error= 0, max_error=0;

    for(i=0; i<COUNT; i++){
        int j;
        for(j=0; j<16; j++){
            ref[j]= random()%255;
            src[j]= random()%255;
        }

        h264_diff_dct_c(block, src, ref, 4);

        //normalize
        for(j=0; j<16; j++){
            block[j]= block[j]*4;
            if(j&1) block[j]= (block[j]*4 + 2)/5;
            if(j&4) block[j]= (block[j]*4 + 2)/5;
        }

        /* use the DSPContext set up above; no context named s exists here */
        dsp.h264_idct_add(ref, block, 4);

        /* ref now holds the reconstruction, compare it against src */
        for(j=0; j<16; j++){
            int diff= FFABS(src[j] - ref[j]);

            error+= diff*diff;
            max_error= FFMAX(max_error, diff);
        }
    }
    printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
#if 0
    printf("testing quantizer\n");
    for(qp=0; qp<52; qp++){
        for(i=0; i<16; i++)
            src1_block[i]= src2_block[i]= random()%255;

    }
#endif
    printf("Testing NAL layer\n");

    uint8_t bitstream[COUNT];
    uint8_t nal[COUNT*2];
    H264Context h;
    memset(&h, 0, sizeof(H264Context));

    for(i=0; i<COUNT; i++){
        int zeros= i;
        int nal_length;
        int consumed;
        int out_length;
        uint8_t *out;
        int j;

        /* start from a payload without any zero byte ... */
        for(j=0; j<COUNT; j++){
            bitstream[j]= (random() % 255) + 1;
        }

        /* ... then zero exactly `zeros` distinct positions */
        for(j=0; j<zeros; j++){
            int pos= random() % COUNT;
            while(bitstream[pos] == 0){
                pos++;
                pos %= COUNT;
            }
            bitstream[pos]=0;
        }

        START_TIMER

        nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
        if(nal_length<0){
            printf("encoding failed\n");
            return -1;
        }

        out= decode_nal(&h, nal, &out_length, &consumed, nal_length);

        STOP_TIMER("NAL")

        if(out_length != COUNT){
            printf("incorrect length %d %d\n", out_length, COUNT);
            return -1;
        }

        if(consumed != nal_length){
            printf("incorrect consumed length %d %d\n", nal_length, consumed);
            return -1;
        }

        if(memcmp(bitstream, out, COUNT)){
            printf("missmatch\n");
            return -1;
        }
    }

    printf("Testing RBSP\n");


    return 0;
}
#endif
8493
8494
8495 static int decode_end(AVCodecContext *avctx)
8496 {
8497     H264Context *h = avctx->priv_data;
8498     MpegEncContext *s = &h->s;
8499
8500     av_freep(&h->rbsp_buffer);
8501     free_tables(h); //FIXME cleanup init stuff perhaps
8502     MPV_common_end(s);
8503
8504 //    memset(h, 0, sizeof(H264Context));
8505
8506     return 0;
8507 }
8508
8509
8510 AVCodec h264_decoder = { /* descriptor of the H.264 decoder; the positional values presumably follow the field order of AVCodec -- confirm against avcodec.h */
8511     "h264", /* presumably the codec name */
8512     CODEC_TYPE_VIDEO,
8513     CODEC_ID_H264,
8514     sizeof(H264Context), /* presumably the size of the private context that decode_frame()/decode_end() read from avctx->priv_data */
8515     decode_init,
8516     NULL, /* NOTE(review): looks like the unused encode callback slot -- confirm */
8517     decode_end,
8518     decode_frame,
8519     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY, /* capability flags; the draw_horiz_band capability is commented out */
8520     .flush= flush_dpb, /* the only field set by name */
8521 };
8522
8523 #ifdef CONFIG_H264_PARSER /* the parser descriptor is only built when CONFIG_H264_PARSER is defined */
8524 AVCodecParser h264_parser = { /* descriptor of the H.264 parser; the positional values presumably follow the field order of AVCodecParser -- confirm against avcodec.h */
8525     { CODEC_ID_H264 }, /* list of handled codec ids, here only H.264 */
8526     sizeof(H264Context), /* presumably the size of the parser's private context */
8527     NULL, /* NOTE(review): looks like an unused init callback slot -- confirm */
8528     h264_parse,
8529     ff_parse_close,
8530     h264_split,
8531 };
8532 #endif
8533
8534 #include "svq3.c"