git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35 #include "rectangle.h"
  36
  37 #include "cabac.h"
  38 #ifdef ARCH_X86
  39 #include "i386/h264_i386.h"
  40 #endif
  41
  42 //#undef NDEBUG
  43 #include <assert.h>
  44
  45 /**
  46  * Value of Picture.reference when the Picture is not a reference picture
  47  * but is still being held for delayed output.
  48  */
  49 #define DELAYED_PIC_REF 4
  50
  /* VLC descriptors together with their statically sized backing tables.
   * Every *_size constant below repeats the entry count of the table array
   * declared just before it.
   * NOTE(review): the code that fills these tables is not visible in this
   * part of the file. */
  /* coeff_token: four tables share one array whose length is the sum of the
   * four entries of coeff_token_vlc_tables_size[]. */
  51 static VLC coeff_token_vlc[4];
  52 static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
  53 static const int coeff_token_vlc_tables_size[4]={520,332,280,256};
  54
  /* coeff_token for chroma DC: a single 256-entry table. */
  55 static VLC chroma_dc_coeff_token_vlc;
  56 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
  57 static const int chroma_dc_coeff_token_vlc_table_size = 256;
  58
  /* total_zeros: 15 tables of 512 entries each. */
  59 static VLC total_zeros_vlc[15];
  60 static VLC_TYPE total_zeros_vlc_tables[15][512][2];
  61 static const int total_zeros_vlc_tables_size = 512;
  62
  /* total_zeros for chroma DC: 3 tables of 8 entries each. */
  63 static VLC chroma_dc_total_zeros_vlc[3];
  64 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
  65 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
  66
  /* run: 6 tables of 8 entries each. */
  67 static VLC run_vlc[6];
  68 static VLC_TYPE run_vlc_tables[6][8][2];
  69 static const int run_vlc_tables_size = 8;
  70
  /* run7: a single 96-entry table. */
  71 static VLC run7_vlc;
  72 static VLC_TYPE run7_vlc_table[96][2];
  73 static const int run7_vlc_table_size = 96;
  74
  /* Forward declarations of static functions whose definitions do not precede
   * this point of the file. */
  75 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  76 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  77 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  78 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  79 static Picture * remove_long(H264Context *h, int i, int ref_mask);
  80
  /**
   * Packs two 16-bit values into one 32-bit word.
   *
   * The halves are ordered so that, once the word is stored to memory, the low
   * 16 bits of a occupy the first two bytes and the low 16 bits of b the
   * following two, on both little- and big-endian hosts.
   *
   * The shift is carried out on uint32_t: callers pass negative values (for
   * example reference indices), and left-shifting a negative int is undefined
   * behaviour in C. The unsigned shift yields the two's-complement bit pattern.
   *
   * @param a value placed in the first 16-bit half (only its low 16 bits on
   *          little-endian hosts)
   * @param b value placed in the second 16-bit half (only its low 16 bits on
   *          big-endian hosts)
   * @return the packed 32-bit word
   */
  static av_always_inline uint32_t pack16to32(int a, int b){
  #ifdef WORDS_BIGENDIAN
     return (b&0xFFFF) + ((uint32_t)a<<16);
  #else
     return (a&0xFFFF) + ((uint32_t)b<<16);
  #endif
  }
  88
  /* Lookup table: ff_rem6[i] == i % 6 for i = 0..51. */
  const uint8_t ff_rem6[52]={
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3,
  };
  92
  /* Lookup table: ff_div6[i] == i / 6 for i = 0..51. */
  const uint8_t ff_div6[52]={
      0, 0, 0, 0, 0, 0,
      1, 1, 1, 1, 1, 1,
      2, 2, 2, 2, 2, 2,
      3, 3, 3, 3, 3, 3,
      4, 4, 4, 4, 4, 4,
      5, 5, 5, 5, 5, 5,
      6, 6, 6, 6, 6, 6,
      7, 7, 7, 7, 7, 7,
      8, 8, 8, 8,
  };
  96
  /*
   * Index rows used by fill_caches() to pick entries of the left neighbour(s).
   * Columns 0..3 index the intra4x4 pred mode, non-zero count and motion rows,
   * columns 4..7 index further non_zero_count entries.
   *   row 0: default (left neighbour has the same frame/field type)
   *   row 1: current MB is a frame MB, left pair is field, bottom MB of the pair
   *   row 2: current MB is a frame MB, left pair is field, top MB of the pair
   *   row 3: current MB is a field MB, left pair is frame
   */
  static const int left_block_options[4][8]={
      { 0, 1, 2, 3,  7, 10,  8, 11 },
      { 2, 2, 3, 3,  8, 11,  8, 11 },
      { 0, 0, 1, 1,  7, 10,  7, 10 },
      { 0, 2, 0, 2,  7, 10,  7, 10 },
  };
 103
 104 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
         /* Fills the neighbour caches kept in h for the macroblock at h->mb_xy:
          * sample-availability masks, intra4x4 pred modes, non-zero counts,
          * CABAC cbp, motion vectors, reference indices, mvd and direct flags.
          * The data comes from the top, left, top-left and top-right
          * macroblocks of the current picture.
          *
          * mb_type is the type of the current macroblock. With a non-zero
          * for_deblock, top/left neighbours are accepted whenever their
          * slice_table entry is < 255 instead of requiring the current
          * slice_num, and the diagonal neighbour types are forced to 0. */
 105     MpegEncContext * const s = &h->s;
 106     const int mb_xy= h->mb_xy;
 107     int topleft_xy, top_xy, topright_xy, left_xy[2];
 108     int topleft_type, top_type, topright_type, left_type[2];
 109     int * left_block;
 110     int topleft_partition= -1;
 111     int i;
 112
 113     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
 114
         /* Deblocking shortcut: outside FRAME_MBAFF nothing is refilled when
          * slice_num is 1 or the top macroblock is in the same slice. */
 115     //FIXME deblocking could skip the intra and nnz parts.
 116     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
 117         return;
 118
 119     /* Wow, what a mess, why didn't they simplify the interlacing & intra
 120      * stuff, I can't imagine that these complex rules are worth it. */
 121
         /* Default neighbour addresses; the FRAME_MBAFF block below adjusts
          * them according to the frame/field type of the macroblock pairs. */
 122     topleft_xy = top_xy - 1;
 123     topright_xy= top_xy + 1;
 124     left_xy[1] = left_xy[0] = mb_xy-1;
 125     left_block = left_block_options[0];
 126     if(FRAME_MBAFF){
 127         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 128         const int top_pair_xy      = pair_xy     - s->mb_stride;
 129         const int topleft_pair_xy  = top_pair_xy - 1;
 130         const int topright_pair_xy = top_pair_xy + 1;
 131         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 132         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 133         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 134         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 135         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 136         const int bottom = (s->mb_y & 1);
 137         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 138         if (bottom
 139                 ? !curr_mb_frame_flag // bottom macroblock
 140                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 141                 ) {
 142             top_xy -= s->mb_stride;
 143         }
 144         if (bottom
 145                 ? !curr_mb_frame_flag // bottom macroblock
 146                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 147                 ) {
 148             topleft_xy -= s->mb_stride;
 149         } else if(bottom && curr_mb_frame_flag && !left_mb_frame_flag) {
 150             topleft_xy += s->mb_stride;
 151             // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
 152             topleft_partition = 0;
 153         }
 154         if (bottom
 155                 ? !curr_mb_frame_flag // bottom macroblock
 156                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 157                 ) {
 158             topright_xy -= s->mb_stride;
 159         }
             /* Left pair has a different frame/field type: both left entries
              * start at the top MB of the left pair and a different
              * left_block_options row is selected. */
 160         if (left_mb_frame_flag != curr_mb_frame_flag) {
 161             left_xy[1] = left_xy[0] = pair_xy - 1;
 162             if (curr_mb_frame_flag) {
 163                 if (bottom) {
 164                     left_block = left_block_options[1];
 165                 } else {
 166                     left_block= left_block_options[2];
 167                 }
 168             } else {
 169                 left_xy[1] += s->mb_stride;
 170                 left_block = left_block_options[3];
 171             }
 172         }
 173     }
 174
         /* Store the resolved top/left macroblock addresses in the context. */
 175     h->top_mb_xy = top_xy;
 176     h->left_mb_xy[0] = left_xy[0];
 177     h->left_mb_xy[1] = left_xy[1];
         /* Neighbour types: a type of 0 is used below as "neighbour missing". */
 178     if(for_deblock){
 179         topleft_type = 0;
 180         topright_type = 0;
 181         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
 182         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
 183         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
 184
             /* MBAFF, non-intra MB: reload the current macroblock's own cache
              * entries. The non-zero flags come from the 16-bit mask that
              * write_back_non_zero_count() stores at non_zero_count[mb_xy][14];
              * motion vectors and reference indices come from current_picture. */
 185         if(FRAME_MBAFF && !IS_INTRA(mb_type)){
 186             int list;
 187             int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
 188             for(i=0; i<16; i++)
 189                 h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
 190             for(list=0; list<h->list_count; list++){
 191                 if(USES_LIST(mb_type,list)){
 192                     uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
 193                     uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
 194                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 195                     for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
 196                         dst[0] = src[0];
 197                         dst[1] = src[1];
 198                         dst[2] = src[2];
 199                         dst[3] = src[3];
 200                     }
 201                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 202                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
 203                     ref += h->b8_stride;
 204                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 205                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
 206                 }else{
 207                     fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
 208                     fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
 209                 }
 210             }
 211         }
 212     }else{
 213         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 214         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 215         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 216         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 217         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 218     }
 219
         /* Intra MBs: the *_samples_available masks start from their "all
          * available" values and have bits cleared for every neighbour that is
          * missing (type 0) or is non-intra while constrained_intra_pred is set.
          * NOTE(review): the per-bit layout of the masks is not documented in
          * this function. */
 220     if(IS_INTRA(mb_type)){
 221         h->topleft_samples_available=
 222         h->top_samples_available=
 223         h->left_samples_available= 0xFFFF;
 224         h->topright_samples_available= 0xEEEA;
 225
 226         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
 227             h->topleft_samples_available= 0xB3FF;
 228             h->top_samples_available= 0x33FF;
 229             h->topright_samples_available= 0x26EA;
 230         }
 231         for(i=0; i<2; i++){
 232             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
 233                 h->topleft_samples_available&= 0xDF5F;
 234                 h->left_samples_available&= 0x5F5F;
 235             }
 236         }
 237
 238         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
 239             h->topleft_samples_available&= 0x7FFF;
 240
 241         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
 242             h->topright_samples_available&= 0xFBFF;
 243
             /* Intra4x4 MBs: load the neighbours' prediction modes into the
              * row above and the column left of the current block in the cache.
              * Neighbours that are not intra4x4 contribute -1 when missing or
              * when inter under constrained_intra_pred, otherwise 2
              * (presumably DC_PRED -- TODO confirm against the mode enum). */
 244         if(IS_INTRA4x4(mb_type)){
 245             if(IS_INTRA4x4(top_type)){
 246                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 247                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 248                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 249                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 250             }else{
 251                 int pred;
 252                 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
 253                     pred= -1;
 254                 else{
 255                     pred= 2;
 256                 }
 257                 h->intra4x4_pred_mode_cache[4+8*0]=
 258                 h->intra4x4_pred_mode_cache[5+8*0]=
 259                 h->intra4x4_pred_mode_cache[6+8*0]=
 260                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 261             }
 262             for(i=0; i<2; i++){
 263                 if(IS_INTRA4x4(left_type[i])){
 264                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 265                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 266                 }else{
 267                     int pred;
 268                     if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
 269                         pred= -1;
 270                     else{
 271                         pred= 2;
 272                     }
 273                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 274                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 275                 }
 276             }
 277         }
 278     }
 279
 280
 281 /*
 282 0 . T T. T T T T
 283 1 L . .L . . . .
 284 2 L . .L . . . .
 285 3 . T TL . . . .
 286 4 L . .L . . . .
 287 5 L . .. . . . .
 288 */
 289 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
         /* Non-zero count cache: copy the top neighbour's stored counts into
          * the T positions of the diagram above. A missing top neighbour
          * yields 64, or 0 for a non-intra MB when CABAC is in use. */
 290     if(top_type){
 291         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 292         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 293         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 294         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 295
 296         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 297         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 298
 299         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 300         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 301
 302     }else{
 303         h->non_zero_count_cache[4+8*0]=
 304         h->non_zero_count_cache[5+8*0]=
 305         h->non_zero_count_cache[6+8*0]=
 306         h->non_zero_count_cache[7+8*0]=
 307
 308         h->non_zero_count_cache[1+8*0]=
 309         h->non_zero_count_cache[2+8*0]=
 310
 311         h->non_zero_count_cache[1+8*3]=
 312         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 313
 314     }
 315
         /* Same for the two left neighbours (the L positions), using the
          * left_block row chosen above to pick the source entries. */
 316     for (i=0; i<2; i++) {
 317         if(left_type[i]){
 318             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 319             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 320             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 321             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 322         }else{
 323             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 324             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 325             h->non_zero_count_cache[0+8*1 +   8*i]=
 326             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 327         }
 328     }
 329
         /* CABAC only: coded block pattern of the top and left neighbours.
          * Missing neighbours give 0x1C0 for intra MBs and 0 otherwise. */
 330     if( h->pps.cabac ) {
 331         // top_cbp
 332         if(top_type) {
 333             h->top_cbp = h->cbp_table[top_xy];
 334         } else if(IS_INTRA(mb_type)) {
 335             h->top_cbp = 0x1C0;
 336         } else {
 337             h->top_cbp = 0;
 338         }
 339         // left_cbp
 340         if (left_type[0]) {
 341             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 342         } else if(IS_INTRA(mb_type)) {
 343             h->left_cbp = 0x1C0;
 344         } else {
 345             h->left_cbp = 0;
 346         }
 347         if (left_type[0]) {
 348             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 349         }
 350         if (left_type[1]) {
 351             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 352         }
 353     }
 354
 355 #if 1
         /* Inter / direct MBs: per reference list, load the neighbours' motion
          * vectors and reference indices. A list that a non-direct MB does not
          * use is skipped unless h->deblocking_filter is set. A neighbour that
          * exists but does not use the list gets mv 0 and ref LIST_NOT_USED; a
          * missing neighbour (type 0) gets mv 0 and ref PART_NOT_AVAILABLE. */
 356     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 357         int list;
 358         for(list=0; list<h->list_count; list++){
 359             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 360                 /*if(!h->mv_cache_clean[list]){
 361                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 362                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 363                     h->mv_cache_clean[list]= 1;
 364                 }*/
 365                 continue;
 366             }
 367             h->mv_cache_clean[list]= 0;
 368
                 /* Top neighbour: its bottom row of four motion vectors and two
                  * reference indices. */
 369             if(USES_LIST(top_type, list)){
 370                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 371                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 372                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 373                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 374                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 375                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 376                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 377                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 378                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 379                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 380             }else{
 381                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 382                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 383                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 384                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 385                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 386             }
 387
                 /* Left neighbours: two cache rows per left_xy[] entry, taken
                  * from the neighbour's rightmost column at the rows named by
                  * left_block. */
 388             for(i=0; i<2; i++){
 389                 int cache_idx = scan8[0] - 1 + i*2*8;
 390                 if(USES_LIST(left_type[i], list)){
 391                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 392                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 393                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 394                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 395                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 396                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 397                 }else{
 398                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 399                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 400                     h->ref_cache[list][cache_idx  ]=
 401                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 402                 }
 403             }
 404
                 /* Outside MBAFF the remaining entries are not filled for
                  * deblocking, nor for direct MBs when direct_spatial_mv_pred
                  * is unset. */
 405             if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
 406                 continue;
 407
 408             if(USES_LIST(topleft_type, list)){
 409                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
 410                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
 411                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 412                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 413             }else{
 414                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 415                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 416             }
 417
 418             if(USES_LIST(topright_type, list)){
 419                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 420                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 421                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 422                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 423             }else{
 424                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 425                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 426             }
 427
                 /* Skip and direct MBs stop here outside MBAFF. */
 428             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 429                 continue;
 430
                 /* Mark the listed in-cache positions as not available. */
 431             h->ref_cache[list][scan8[5 ]+1] =
 432             h->ref_cache[list][scan8[7 ]+1] =
 433             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 434             h->ref_cache[list][scan8[4 ]] =
 435             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 436             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 437             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 438             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 439             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 440             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 441
 442             if( h->pps.cabac ) {
 443                 /* XXX beurk, Load mvd */
 444                 if(USES_LIST(top_type, list)){
 445                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 446                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 447                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 448                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 449                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 450                 }else{
 451                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 452                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 453                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 454                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 455                 }
 456                 if(USES_LIST(left_type[0], list)){
 457                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 458                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 459                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 460                 }else{
 461                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 462                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 463                 }
 464                 if(USES_LIST(left_type[1], list)){
 465                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 466                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 467                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 468                 }else{
 469                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 470                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 471                 }
 472                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 473                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 474                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 475                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 476                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 477
                     /* B slices: direct flags of the top and left neighbours.
                      * The value is 1 for direct MBs, the stored direct_table
                      * entry for 8x8 MBs, and 0 otherwise. */
 478                 if(h->slice_type_nos == FF_B_TYPE){
 479                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 480
 481                     if(IS_DIRECT(top_type)){
 482                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 483                     }else if(IS_8X8(top_type)){
 484                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 485                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 486                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 487                     }else{
 488                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 489                     }
 490
 491                     if(IS_DIRECT(left_type[0]))
 492                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 493                     else if(IS_8X8(left_type[0]))
 494                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 495                     else
 496                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 497
 498                     if(IS_DIRECT(left_type[1]))
 499                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 500                     else if(IS_8X8(left_type[1]))
 501                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 502                     else
 503                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 504                 }
 505             }
 506
                 /* MBAFF: rescale neighbour entries whose frame/field type
                  * differs from the current MB. For a field MB (MB_FIELD),
                  * non-interlaced neighbours with ref >= 0 get ref doubled and
                  * the vertical mv/mvd component halved; for a frame MB,
                  * interlaced neighbours get ref halved and the vertical
                  * component doubled. */
 507             if(FRAME_MBAFF){
 508 #define MAP_MVS\
 509                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 510                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 511                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 512                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 513                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 514                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 515                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 516                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 517                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 518                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 519                 if(MB_FIELD){
 520 #define MAP_F2F(idx, mb_type)\
 521                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 522                         h->ref_cache[list][idx] <<= 1;\
 523                         h->mv_cache[list][idx][1] /= 2;\
 524                         h->mvd_cache[list][idx][1] /= 2;\
 525                     }
 526                     MAP_MVS
 527 #undef MAP_F2F
 528                 }else{
 529 #define MAP_F2F(idx, mb_type)\
 530                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 531                         h->ref_cache[list][idx] >>= 1;\
 532                         h->mv_cache[list][idx][1] <<= 1;\
 533                         h->mvd_cache[list][idx][1] <<= 1;\
 534                     }
 535                     MAP_MVS
 536 #undef MAP_F2F
 537                 }
 538             }
 539         }
 540     }
 541 #endif
 542
         /* Number (0..2) of the top and first-left neighbours for which
          * IS_8x8DCT() is true. */
 543     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 544 }
 545
 546 static inline void write_back_intra_pred_mode(H264Context *h){
 547     const int mb_xy= h->mb_xy;
 548
 549     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 550     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 551     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 552     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 553     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 554     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 555     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 556 }
 557
 558 /**
 559  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 560  */
 561 static inline int check_intra4x4_pred_mode(H264Context *h){
 562     MpegEncContext * const s = &h->s;
 563     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 564     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 565     int i;
 566
 567     if(!(h->top_samples_available&0x8000)){
 568         for(i=0; i<4; i++){
 569             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 570             if(status<0){
 571                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 572                 return -1;
 573             } else if(status){
 574                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 575             }
 576         }
 577     }
 578
 579     if(!(h->left_samples_available&0x8000)){
 580         for(i=0; i<4; i++){
 581             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 582             if(status<0){
 583                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 584                 return -1;
 585             } else if(status){
 586                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 587             }
 588         }
 589     }
 590
 591     return 0;
 592 } //FIXME cleanup like next
 593
 594 /**
 595  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 596  */
 597 static inline int check_intra_pred_mode(H264Context *h, int mode){
 598     MpegEncContext * const s = &h->s;
 599     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 600     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 601
 602     if(mode > 6U) {
 603         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 604         return -1;
 605     }
 606
 607     if(!(h->top_samples_available&0x8000)){
 608         mode= top[ mode ];
 609         if(mode<0){
 610             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 611             return -1;
 612         }
 613     }
 614
 615     if(!(h->left_samples_available&0x8000)){
 616         mode= left[ mode ];
 617         if(mode<0){
 618             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 619             return -1;
 620         }
 621     }
 622
 623     return mode;
 624 }
 625
 626 /**
 627  * gets the predicted intra4x4 prediction mode.
 628  */
 629 static inline int pred_intra_mode(H264Context *h, int n){
 630     const int index8= scan8[n];
 631     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 632     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 633     const int min= FFMIN(left, top);
 634
 635     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 636
 637     if(min<0) return DC_PRED;
 638     else      return min;
 639 }
 640
 641 static inline void write_back_non_zero_count(H264Context *h){
 642     const int mb_xy= h->mb_xy;
 643
 644     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 645     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 646     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 647     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 648     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 649     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 650     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 651
 652     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 653     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 654     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 655
 656     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 657     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 658     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 659
 660     if(FRAME_MBAFF){
 661         // store all luma nnzs, for deblocking
 662         int v = 0, i;
 663         for(i=0; i<16; i++)
 664             v += (!!h->non_zero_count_cache[scan8[i]]) << i;
 665         *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
 666     }
 667 }
 668
 669 /**
 670  * gets the predicted number of non-zero coefficients.
 671  * @param n block index
 672  */
 673 static inline int pred_non_zero_count(H264Context *h, int n){
 674     const int index8= scan8[n];
 675     const int left= h->non_zero_count_cache[index8 - 1];
 676     const int top = h->non_zero_count_cache[index8 - 8];
 677     int i= left + top;
 678
 679     if(i<64) i= (i+1)>>1;
 680
 681     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 682
 683     return i&31;
 684 }
 685
     /**
      * Fetches the diagonal neighbour motion vector of a partition: the top-right
      * one if its reference is available, otherwise the top-left one.
      * @param C receives a pointer to the selected motion vector
      * @param i position of the partition in the scan8-indexed caches
      * @param list reference list to read from
      * @param part_width width of the partition, added to the cache position to
      *                   reach the top-right neighbour
      * @return the reference value that belongs to *C; inside the FRAME_MBAFF
      *         special cases this is LIST_NOT_USED or a rescaled ref_index read
      *         from the current picture
      */
 686 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 687     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 688     MpegEncContext *s = &h->s;
 689
 690     /* there is no consistent mapping of mvs to neighboring locations that will
 691      * make mbaff happy, so we can't move all this logic to fill_caches */
 692     if(FRAME_MBAFF){
 693         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 694         const int16_t *mv;
             /* scratch slot in mv_cache: zeroed here, filled by SET_DIAG_MV and
              * handed out through *C; *C is overwritten below if no special case
              * returns */
 695         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 696         *C = h->mv_cache[list][scan8[0]-2];
 697
 698         if(!MB_FIELD
 699            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 700             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 701             if(IS_INTERLACED(mb_types[topright_xy])){
                 /* SET_DIAG_MV always returns from fetch_diagonal_mv(): it copies
                  * the mv at 4x4 position (X4,Y4) of the current picture into the
                  * scratch slot, applying MV_OP to the vertical component, and
                  * returns the matching ref_index with REF_OP applied, or
                  * LIST_NOT_USED if that macroblock does not use this list. */
 702 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 703                 const int x4 = X4, y4 = Y4;\
 704                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 705                 if(!USES_LIST(mb_type,list))\
 706                     return LIST_NOT_USED;\
 707                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 708                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 709                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 710                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 711
 712                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 713             }
 714         }
             /* top-right unavailable: under the conditions below the diagonal is
              * read from the left macroblock (h->left_mb_xy[0]) when its
              * interlacing differs from the current macroblock */
 715         if(topright_ref == PART_NOT_AVAILABLE
 716            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 717            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 718             if(!MB_FIELD
 719                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 720                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 721             }
 722             if(MB_FIELD
 723                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 724                && i >= scan8[0]+8){
 725                 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
 726                 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 727             }
 728         }
 729 #undef SET_DIAG_MV
 730     }
 731
         /* regular case, also reached with FRAME_MBAFF when no special case returned */
 732     if(topright_ref != PART_NOT_AVAILABLE){
 733         *C= h->mv_cache[list][ i - 8 + part_width ];
 734         return topright_ref;
 735     }else{
 736         tprintf(s->avctx, "topright MV not available\n");
 737
             /* fall back to the top-left neighbour */
 738         *C= h->mv_cache[list][ i - 8 - 1 ];
 739         return h->ref_cache[list][ i - 8 - 1 ];
 740     }
 741 }
 742
 743 /**
 744  * gets the predicted MV.
 745  * @param n the block index
 746  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 747  * @param mx the x component of the predicted motion vector
 748  * @param my the y component of the predicted motion vector
 749  */
 750 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 751     const int index8= scan8[n];
 752     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 753     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 754     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 755     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 756     const int16_t * C;
 757     int diagonal_ref, match_count;
 758
 759     assert(part_width==1 || part_width==2 || part_width==4);
 760
 761 /* mv_cache
 762   B . . A T T T T
 763   U . . L . . , .
 764   U . . L . . . .
 765   U . . L . . , .
 766   . . . L . . . .
 767 */
 768
 769     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 770     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 771     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 772     if(match_count > 1){ //most common
 773         *mx= mid_pred(A[0], B[0], C[0]);
 774         *my= mid_pred(A[1], B[1], C[1]);
 775     }else if(match_count==1){
 776         if(left_ref==ref){
 777             *mx= A[0];
 778             *my= A[1];
 779         }else if(top_ref==ref){
 780             *mx= B[0];
 781             *my= B[1];
 782         }else{
 783             *mx= C[0];
 784             *my= C[1];
 785         }
 786     }else{
 787         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 788             *mx= A[0];
 789             *my= A[1];
 790         }else{
 791             *mx= mid_pred(A[0], B[0], C[0]);
 792             *my= mid_pred(A[1], B[1], C[1]);
 793         }
 794     }
 795
 796     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 797 }
 798
 799 /**
 800  * gets the directionally predicted 16x8 MV.
 801  * @param n the block index
 802  * @param mx the x component of the predicted motion vector
 803  * @param my the y component of the predicted motion vector
 804  */
 805 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 806     if(n==0){
 807         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 808         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 809
 810         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 811
 812         if(top_ref == ref){
 813             *mx= B[0];
 814             *my= B[1];
 815             return;
 816         }
 817     }else{
 818         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 819         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 820
 821         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 822
 823         if(left_ref == ref){
 824             *mx= A[0];
 825             *my= A[1];
 826             return;
 827         }
 828     }
 829
 830     //RARE
 831     pred_motion(h, n, 4, list, ref, mx, my);
 832 }
 833
 834 /**
 835  * gets the directionally predicted 8x16 MV.
 836  * @param n the block index
 837  * @param mx the x component of the predicted motion vector
 838  * @param my the y component of the predicted motion vector
 839  */
 840 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 841     if(n==0){
 842         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 843         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 844
 845         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 846
 847         if(left_ref == ref){
 848             *mx= A[0];
 849             *my= A[1];
 850             return;
 851         }
 852     }else{
 853         const int16_t * C;
 854         int diagonal_ref;
 855
 856         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 857
 858         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 859
 860         if(diagonal_ref == ref){
 861             *mx= C[0];
 862             *my= C[1];
 863             return;
 864         }
 865     }
 866
 867     //RARE
 868     pred_motion(h, n, 2, list, ref, mx, my);
 869 }
 870
 871 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 872     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 873     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 874
 875     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 876
 877     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 878        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 879        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 880
 881         *mx = *my = 0;
 882         return;
 883     }
 884
 885     pred_motion(h, 0, 4, 0, 0, mx, my);
 886
 887     return;
 888 }
 889
 890 static inline void direct_dist_scale_factor(H264Context * const h){
 891     MpegEncContext * const s = &h->s;
 892     const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
 893     const int poc1 = h->ref_list[1][0].poc;
 894     int i;
 895     for(i=0; i<h->ref_count[0]; i++){
 896         int poc0 = h->ref_list[0][i].poc;
 897         int td = av_clip(poc1 - poc0, -128, 127);
 898         if(td == 0 || h->ref_list[0][i].long_ref){
 899             h->dist_scale_factor[i] = 256;
 900         }else{
 901             int tb = av_clip(poc - poc0, -128, 127);
 902             int tx = (16384 + (FFABS(td) >> 1)) / td;
 903             h->dist_scale_factor[i] = av_clip((tb*tx + 32) >> 6, -1024, 1023);
 904         }
 905     }
 906     if(FRAME_MBAFF){
 907         for(i=0; i<h->ref_count[0]; i++){
 908             h->dist_scale_factor_field[2*i] =
 909             h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
 910         }
 911     }
 912 }
 913 static inline void direct_ref_list_init(H264Context * const h){
 914     MpegEncContext * const s = &h->s;
 915     Picture * const ref1 = &h->ref_list[1][0];
 916     Picture * const cur = s->current_picture_ptr;
 917     int list, i, j;
 918     int sidx= s->picture_structure&1;
 919     int ref1sidx= ref1->reference&1;
 920     for(list=0; list<2; list++){
 921         cur->ref_count[sidx][list] = h->ref_count[list];
 922         for(j=0; j<h->ref_count[list]; j++)
 923             cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
 924     }
 925     if(s->picture_structure == PICT_FRAME){
 926         memcpy(cur->ref_count[0], cur->ref_count[1], sizeof(cur->ref_count[0]));
 927         memcpy(cur->ref_poc  [0], cur->ref_poc  [1], sizeof(cur->ref_poc  [0]));
 928     }
 929     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 930         return;
 931     for(list=0; list<2; list++){
 932         for(i=0; i<ref1->ref_count[ref1sidx][list]; i++){
 933             int poc = ref1->ref_poc[ref1sidx][list][i];
 934             if(((poc&3) == 3) != (s->picture_structure == PICT_FRAME))
 935                 poc= (poc&~3) + s->picture_structure;
 936             h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
 937             for(j=0; j<h->ref_count[list]; j++)
 938                 if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
 939                     h->map_col_to_list0[list][i] = j;
 940                     break;
 941                 }
 942         }
 943     }
 944     if(FRAME_MBAFF){
 945         for(list=0; list<2; list++){
 946             for(i=0; i<ref1->ref_count[ref1sidx][list]; i++){
 947                 j = h->map_col_to_list0[list][i];
 948                 h->map_col_to_list0_field[list][2*i] = 2*j;
 949                 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
 950             }
 951         }
 952     }
 953 }
 954
     /**
      * Derives reference indices and motion vectors for the direct predicted parts
      * of the current macroblock and stores them in h->ref_cache and h->mv_cache.
      * The colocated macroblock is taken from h->ref_list[1][0]; depending on
      * h->direct_spatial_mv_pred either spatial or temporal direct prediction is
      * performed. If *mb_type is an 8x8 type, only sub-blocks whose
      * h->sub_mb_type is direct are processed.
      * @param mb_type in/out: partition size and list flags are ORed into it; with
      *                spatial prediction the flag of a list without reference is
      *                cleared again (only if the macroblock is not 8x8)
      */
 955 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 956     MpegEncContext * const s = &h->s;
 957     int b8_stride = h->b8_stride;
 958     int b4_stride = h->b_stride;
 959     int mb_xy = h->mb_xy;
 960     int mb_type_col[2];
 961     const int16_t (*l1mv0)[2], (*l1mv1)[2];
 962     const int8_t *l1ref0, *l1ref1;
 963     const int is_b8x8 = IS_8X8(*mb_type);
 964     unsigned int sub_mb_type;
 965     int i8, i4;
 966
 967 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
 968
         /* Step 1: locate the colocated macroblock(s) (mb_xy, mb_type_col) and
          * choose the partition size used for the prediction. */
 969     if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
 970         if(h->ref_list[1][0].reference == PICT_FRAME){   // AFL/AFR/FR/FL -> AFL
 971             if(!IS_INTERLACED(*mb_type)){                //     AFR/FR    -> AFL
 972                 int cur_poc = s->current_picture_ptr->poc;
 973                 int *col_poc = h->ref_list[1]->field_poc;
                     /* 0 or 1: the colocated field whose POC is closer to the
                      * current POC (1 on a tie) */
 974                 int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
 975                 mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
 976                 l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
 977                 l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
 978                 l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
 979                 l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
 980                 if(s->mb_y&1){
 981                     l1ref0 +=   b8_stride;
 982                     l1ref1 +=   b8_stride;
 983                     l1mv0  += 2*b4_stride;
 984                     l1mv1  += 2*b4_stride;
 985                 }
                     /* the l1 pointers are final: a zero b8_stride skips the
                      * generic pointer setup after step 1 and removes the row
                      * term wherever l1ref is indexed through b8_stride */
 986                 b8_stride = 0;
 987             }
 988         }else if(!(s->picture_structure & h->ref_list[1][0].reference)){// FL -> FL & differ parity
 989             int fieldoff= 2*(h->ref_list[1][0].reference)-3;
 990             mb_xy += s->mb_stride*fieldoff;
 991         }
 992         goto single_col;
 993     }else{                                               // AFL/AFR/FR/FL -> AFR/FR
 994         if(IS_INTERLACED(*mb_type)){                     // AFL       /FL -> AFR/FR
                 /* two colocated macroblocks: the vertically adjacent pair
                  * starting at the even row */
 995             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
 996             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
 997             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
 998             b8_stride *= 3;
 999             b4_stride *= 6;
1000             //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
1001             if(    (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
1002                 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
1003                 && !is_b8x8){
1004                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1005                 *mb_type   |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
1006             }else{
1007                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1008                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1009             }
1010         }else{                                           //     AFR/FR    -> AFR/FR
                 /* also entered via goto from the interlaced colocated case above */
1011 single_col:
1012             mb_type_col[0] =
1013             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
1014             if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
1015                 /* FIXME save sub mb types from previous frames (or derive from MVs)
1016                 * so we know exactly what block size to use */
1017                 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1018                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1019             }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
1020                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1021                 *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1022             }else{
1023                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1024                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1025             }
1026         }
1027     }
1028
         /* colocated motion vectors / reference indices at mb_xy, unless they were
          * already set up above (b8_stride == 0) */
1029     if(b8_stride){
1030         l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
1031         l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
1032         l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
1033         l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
1034     }
1035
         /* Step 2: spatial or temporal prediction */
1036     if(h->direct_spatial_mv_pred){
1037         int ref[2];
1038         int mv[2][2];
1039         int list;
1040
1041         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1042
1043         /* ref = min(neighbors) */
1044         for(list=0; list<2; list++){
1045             int refa = h->ref_cache[list][scan8[0] - 1];
1046             int refb = h->ref_cache[list][scan8[0] - 8];
1047             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1048             if(refc == PART_NOT_AVAILABLE)
1049                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
                 /* compared as unsigned, negative refs count as the largest values;
                  * the result is only negative if all three refs are, and is then
                  * normalized to -1 */
1050             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1051             if(ref[list] < 0)
1052                 ref[list] = -1;
1053         }
1054
             /* no reference in either list: use ref 0 with a zero mv in both lists */
1055         if(ref[0] < 0 && ref[1] < 0){
1056             ref[0] = ref[1] = 0;
1057             mv[0][0] = mv[0][1] =
1058             mv[1][0] = mv[1][1] = 0;
1059         }else{
1060             for(list=0; list<2; list++){
1061                 if(ref[list] >= 0)
1062                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1063                 else
1064                     mv[list][0] = mv[list][1] = 0;
1065             }
1066         }
1067
             /* drop the flag of a list that ended up without a reference */
1068         if(ref[1] < 0){
1069             if(!is_b8x8)
1070                 *mb_type &= ~MB_TYPE_L1;
1071             sub_mb_type &= ~MB_TYPE_L1;
1072         }else if(ref[0] < 0){
1073             if(!is_b8x8)
1074                 *mb_type &= ~MB_TYPE_L0;
1075             sub_mb_type &= ~MB_TYPE_L0;
1076         }
1077
             /* current and colocated macroblock differ in interlacing */
1078         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1079             for(i8=0; i8<4; i8++){
1080                 int x8 = i8&1;
1081                 int y8 = i8>>1;
1082                 int xy8 = x8+y8*b8_stride;
1083                 int xy4 = 3*x8+y8*b4_stride;
1084                 int a=0, b=0;
1085
1086                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1087                     continue;
1088                 h->sub_mb_type[i8] = sub_mb_type;
1089
1090                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1091                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
                     /* colocated block is non-intra with ref 0 and an mv within
                      * +-1: a and b stay zero unless the respective ref is > 0 */
1092                 if(!IS_INTRA(mb_type_col[y8])
1093                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1094                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1095                     if(ref[0] > 0)
1096                         a= pack16to32(mv[0][0],mv[0][1]);
1097                     if(ref[1] > 0)
1098                         b= pack16to32(mv[1][0],mv[1][1]);
1099                 }else{
1100                     a= pack16to32(mv[0][0],mv[0][1]);
1101                     b= pack16to32(mv[1][0],mv[1][1]);
1102                 }
1103                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1104                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1105             }
1106         }else if(IS_16X16(*mb_type)){
1107             int a=0, b=0;
1108
1109             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1110             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
                 /* same zero-mv test as above on the first colocated block; the
                  * list 1 alternative additionally depends on h->x264_build */
1111             if(!IS_INTRA(mb_type_col[0])
1112                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1113                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1114                        && (h->x264_build>33 || !h->x264_build)))){
1115                 if(ref[0] > 0)
1116                     a= pack16to32(mv[0][0],mv[0][1]);
1117                 if(ref[1] > 0)
1118                     b= pack16to32(mv[1][0],mv[1][1]);
1119             }else{
1120                 a= pack16to32(mv[0][0],mv[0][1]);
1121                 b= pack16to32(mv[1][0],mv[1][1]);
1122             }
1123             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1124             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1125         }else{
1126             for(i8=0; i8<4; i8++){
1127                 const int x8 = i8&1;
1128                 const int y8 = i8>>1;
1129
1130                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1131                     continue;
1132                 h->sub_mb_type[i8] = sub_mb_type;
1133
                     /* start from the predicted mv / ref; vectors of lists with
                      * ref 0 are zeroed below where the colocated test hits */
1134                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1135                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1136                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1137                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1138
1139                 /* col_zero_flag */
1140                 if(!IS_INTRA(mb_type_col[0]) && (   l1ref0[x8 + y8*h->b8_stride] == 0
1141                                               || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
1142                                                   && (h->x264_build>33 || !h->x264_build)))){
1143                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
1144                     if(IS_SUB_8X8(sub_mb_type)){
                             /* one colocated mv is tested for the whole 8x8 block */
1145                         const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1146                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1147                             if(ref[0] == 0)
1148                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1149                             if(ref[1] == 0)
1150                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1151                         }
1152                     }else
                         /* otherwise each 4x4 block is tested individually */
1153                     for(i4=0; i4<4; i4++){
1154                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1155                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1156                             if(ref[0] == 0)
1157                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1158                             if(ref[1] == 0)
1159                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1160                         }
1161                     }
1162                 }
1163             }
1164         }
1165     }else{ /* direct temporal mv pred */
1166         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1167         const int *dist_scale_factor = h->dist_scale_factor;
1168
             /* interlaced macroblocks of MBAFF frames use the field variants of
              * the mapping and scale tables */
1169         if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
1170             map_col_to_list0[0] = h->map_col_to_list0_field[0];
1171             map_col_to_list0[1] = h->map_col_to_list0_field[1];
1172             dist_scale_factor = h->dist_scale_factor_field;
1173         }
1174         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1175             /* FIXME assumes direct_8x8_inference == 1 */
1176             int y_shift;
1177             int ref_shift;
1178
1179             if(IS_INTERLACED(*mb_type)){
1180                 /* frame to field scaling */
1181                 y_shift = 0;
1182                 ref_shift= FRAME_MBAFF ? 0 : 1;
1183             }else{
                     /* field to frame scaling */
1184                 y_shift = 2;
1185                 ref_shift= FRAME_MBAFF ? 2 : 1;
1186             }
1187
1188             for(i8=0; i8<4; i8++){
1189                 const int x8 = i8&1;
1190                 const int y8 = i8>>1;
1191                 int ref0, scale;
1192                 const int16_t (*l1mv)[2]= l1mv0;
1193
1194                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1195                     continue;
1196                 h->sub_mb_type[i8] = sub_mb_type;
1197
                     /* list 1 always uses reference 0; an intra colocated block
                      * yields ref 0 and zero mvs in both lists */
1198                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1199                 if(IS_INTRA(mb_type_col[y8])){
1200                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1201                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1202                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1203                     continue;
1204                 }
1205
                     /* take the colocated list 0 reference if present, otherwise
                      * the list 1 reference together with the list 1 mvs */
1206                 ref0 = l1ref0[x8 + y8*b8_stride];
1207                 if(ref0 >= 0)
1208                     ref0 = map_col_to_list0[0][ref0*2>>ref_shift];
1209                 else{
1210                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride]*2>>ref_shift];
1211                     l1mv= l1mv1;
1212                 }
1213                 scale = dist_scale_factor[ref0];
1214                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1215
1216                 {
                         /* list 0 mv = scaled colocated mv (vertical component
                          * rescaled by y_shift first), list 1 mv = list 0 mv
                          * minus the colocated mv */
1217                     const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
1218                     int my_col = (mv_col[1]<<y_shift)/2;
1219                     int mx = (scale * mv_col[0] + 128) >> 8;
1220                     int my = (scale * my_col + 128) >> 8;
1221                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1222                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1223                 }
1224             }
1225             return;
1226         }
1227
1228         /* one-to-one mv scaling */
1229
1230         if(IS_16X16(*mb_type)){
1231             int ref, mv0, mv1;
1232
1233             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1234             if(IS_INTRA(mb_type_col[0])){
1235                 ref=mv0=mv1=0;
1236             }else{
1237                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
1238                                                 : map_col_to_list0[1][l1ref1[0]];
1239                 const int scale = dist_scale_factor[ref0];
1240                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1241                 int mv_l0[2];
1242                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1243                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1244                 ref= ref0;
1245                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1246                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1247             }
1248             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1249             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1250             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1251         }else{
1252             for(i8=0; i8<4; i8++){
1253                 const int x8 = i8&1;
1254                 const int y8 = i8>>1;
1255                 int ref0, scale;
1256                 const int16_t (*l1mv)[2]= l1mv0;
1257
1258                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1259                     continue;
1260                 h->sub_mb_type[i8] = sub_mb_type;
1261                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1262                 if(IS_INTRA(mb_type_col[0])){
1263                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1264                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1265                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1266                     continue;
1267                 }
1268
1269                 ref0 = l1ref0[x8 + y8*h->b8_stride];
1270                 if(ref0 >= 0)
1271                     ref0 = map_col_to_list0[0][ref0];
1272                 else{
1273                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
1274                     l1mv= l1mv1;
1275                 }
1276                 scale = dist_scale_factor[ref0];
1277
1278                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1279                 if(IS_SUB_8X8(sub_mb_type)){
                         /* one colocated mv is scaled for the whole 8x8 block */
1280                     const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1281                     int mx = (scale * mv_col[0] + 128) >> 8;
1282                     int my = (scale * mv_col[1] + 128) >> 8;
1283                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1284                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1285                 }else
                     /* otherwise each 4x4 block scales its own colocated mv */
1286                 for(i4=0; i4<4; i4++){
1287                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1288                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1289                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1290                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1291                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1292                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1293                 }
1294             }
1295         }
1296     }
1297 }
1298
1299 static inline void write_back_motion(H264Context *h, int mb_type){
1300     MpegEncContext * const s = &h->s;
1301     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1302     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1303     int list;
1304
1305     if(!USES_LIST(mb_type, 0))
1306         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1307
1308     for(list=0; list<h->list_count; list++){
1309         int y;
1310         if(!USES_LIST(mb_type, list))
1311             continue;
1312
1313         for(y=0; y<4; y++){
1314             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1315             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1316         }
1317         if( h->pps.cabac ) {
1318             if(IS_SKIP(mb_type))
1319                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1320             else
1321             for(y=0; y<4; y++){
1322                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1323                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1324             }
1325         }
1326
1327         {
1328             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1329             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1330             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1331             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1332             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1333         }
1334     }
1335
1336     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1337         if(IS_8X8(mb_type)){
1338             uint8_t *direct_table = &h->direct_table[b8_xy];
1339             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1340             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1341             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1342         }
1343     }
1344 }
1345
1346 /**
1347  * Decodes a network abstraction layer unit.
1348  * @param consumed is the number of bytes used as input
1349  * @param length is the length of the array
1350  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1351  * @returns decoded bytes, might be src+1 if no escapes
1352  */
1353 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1354     int i, si, di;
1355     uint8_t *dst;
1356     int bufidx;
1357
1358 //    src[0]&0x80;                //forbidden bit
1359     h->nal_ref_idc= src[0]>>5;
1360     h->nal_unit_type= src[0]&0x1F;
1361
1362     src++; length--;
1363 #if 0
1364     for(i=0; i<length; i++)
1365         printf("%2X ", src[i]);
1366 #endif
1367     for(i=0; i+1<length; i+=2){
1368         if(src[i]) continue;
1369         if(i>0 && src[i-1]==0) i--;
1370         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1371             if(src[i+2]!=3){
1372                 /* startcode, so we must be past the end */
1373                 length=i;
1374             }
1375             break;
1376         }
1377     }
1378
1379     if(i>=length-1){ //no escaped 0
1380         *dst_length= length;
1381         *consumed= length+1; //+1 for the header
1382         return src;
1383     }
1384
1385     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1386     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
1387     dst= h->rbsp_buffer[bufidx];
1388
1389     if (dst == NULL){
1390         return NULL;
1391     }
1392
1393 //printf("decoding esc\n");
1394     si=di=0;
1395     while(si<length){
1396         //remove escapes (very rare 1:2^22)
1397         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1398             if(src[si+2]==3){ //escape
1399                 dst[di++]= 0;
1400                 dst[di++]= 0;
1401                 si+=3;
1402                 continue;
1403             }else //next start code
1404                 break;
1405         }
1406
1407         dst[di++]= src[si++];
1408     }
1409
1410     *dst_length= di;
1411     *consumed= si + 1;//+1 for the header
1412 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1413     return dst;
1414 }
1415
1416 /**
1417  * identifies the exact end of the bitstream
1418  * @return the length of the trailing, or 0 if damaged
1419  */
1420 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1421     int v= *src;
1422     int r;
1423
1424     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1425
1426     for(r=1; r<9; r++){
1427         if(v&1) return r;
1428         v>>=1;
1429     }
1430     return 0;
1431 }
1432
1433 /**
1434  * IDCT transforms the 16 dc values and dequantizes them.
1435  * @param qp quantization parameter
1436  */
1437 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1438 #define stride 16
1439     int i;
1440     int temp[16]; //FIXME check if this is a good idea
1441     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1442     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1443
1444 //memset(block, 64, 2*256);
1445 //return;
1446     for(i=0; i<4; i++){
1447         const int offset= y_offset[i];
1448         const int z0= block[offset+stride*0] + block[offset+stride*4];
1449         const int z1= block[offset+stride*0] - block[offset+stride*4];
1450         const int z2= block[offset+stride*1] - block[offset+stride*5];
1451         const int z3= block[offset+stride*1] + block[offset+stride*5];
1452
1453         temp[4*i+0]= z0+z3;
1454         temp[4*i+1]= z1+z2;
1455         temp[4*i+2]= z1-z2;
1456         temp[4*i+3]= z0-z3;
1457     }
1458
1459     for(i=0; i<4; i++){
1460         const int offset= x_offset[i];
1461         const int z0= temp[4*0+i] + temp[4*2+i];
1462         const int z1= temp[4*0+i] - temp[4*2+i];
1463         const int z2= temp[4*1+i] - temp[4*3+i];
1464         const int z3= temp[4*1+i] + temp[4*3+i];
1465
1466         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1467         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1468         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1469         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1470     }
1471 }
1472
#if 0
/**
 * DCT transforms the 16 dc values.
 * Disabled code; it uses the stride macro defined inside
 * h264_luma_dc_dequant_idct_c().
 * @param qp quantization parameter ??? FIXME (currently not part of the signature)
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
    int temp[16]; //FIXME check if this is a good idea
    int i;

    // first pass: butterflies within each group, results go to temp
    for(i=0; i<4; i++){
        const DCTELEM *in= block + y_offset[i];
        int *out= temp + 4*i;
        const int s04= in[stride*0] + in[stride*4];
        const int d04= in[stride*0] - in[stride*4];
        const int d15= in[stride*1] - in[stride*5];
        const int s15= in[stride*1] + in[stride*5];

        out[0]= s04+s15;
        out[1]= d04+d15;
        out[2]= d04-d15;
        out[3]= s04-s15;
    }

    // second pass: butterflies across the groups, results are halved
    for(i=0; i<4; i++){
        DCTELEM *out= block + x_offset[i];
        const int s02= temp[4*0+i] + temp[4*2+i];
        const int d02= temp[4*0+i] - temp[4*2+i];
        const int d13= temp[4*1+i] - temp[4*3+i];
        const int s13= temp[4*1+i] + temp[4*3+i];

        out[stride*0 ]= (s02 + s13)>>1;
        out[stride*2 ]= (d02 + d13)>>1;
        out[stride*8 ]= (d02 - d13)>>1;
        out[stride*10]= (s02 - s13)>>1;
    }
}
#endif
1512
1513 #undef xStride
1514 #undef stride
1515
1516 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1517     const int stride= 16*2;
1518     const int xStride= 16;
1519     int a,b,c,d,e;
1520
1521     a= block[stride*0 + xStride*0];
1522     b= block[stride*0 + xStride*1];
1523     c= block[stride*1 + xStride*0];
1524     d= block[stride*1 + xStride*1];
1525
1526     e= a-b;
1527     a= a+b;
1528     b= c-d;
1529     c= c+d;
1530
1531     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1532     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1533     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1534     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1535 }
1536
#if 0
/**
 * Transforms the 2x2 chroma dc values in place (disabled code).
 * The four values sit at offsets 0, 16, 32 and 48 of block.
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int stride= 16*2;
    const int xStride= 16;
    DCTELEM * const row0= block;
    DCTELEM * const row1= block + stride;
    const int sum0 = row0[0] + row0[xStride];
    const int diff0= row0[0] - row0[xStride];
    const int sum1 = row1[0] + row1[xStride];
    const int diff1= row1[0] - row1[xStride];

    row0[0      ]= (sum0  + sum1 );
    row0[xStride]= (diff0 + diff1);
    row1[0      ]= (sum0  - sum1 );
    row1[xStride]= (diff0 - diff1);
}
#endif
1559
1560 /**
1561  * gets the chroma qp.
1562  */
1563 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1564     return h->pps.chroma_qp_table[t][qscale];
1565 }
1566
1567 //FIXME need to check that this does not overflow signed 32 bit for low qp, I am not sure, it's very close
1568 //FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
1569 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
1570     int i;
1571     const int * const quant_table= quant_coeff[qscale];
1572     const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1573     const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1574     const unsigned int threshold2= (threshold1<<1);
1575     int last_non_zero;
1576
1577     if(separate_dc){
1578         if(qscale<=18){
1579             //avoid overflows
1580             const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1581             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1582             const unsigned int dc_threshold2= (dc_threshold1<<1);
1583
1584             int level= block[0]*quant_coeff[qscale+18][0];
1585             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1586                 if(level>0){
1587                     level= (dc_bias + level)>>(QUANT_SHIFT-2);
1588                     block[0]= level;
1589                 }else{
1590                     level= (dc_bias - level)>>(QUANT_SHIFT-2);
1591                     block[0]= -level;
1592                 }
1593 //                last_non_zero = i;
1594             }else{
1595                 block[0]=0;
1596             }
1597         }else{
1598             const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1599             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1600             const unsigned int dc_threshold2= (dc_threshold1<<1);
1601
1602             int level= block[0]*quant_table[0];
1603             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1604                 if(level>0){
1605                     level= (dc_bias + level)>>(QUANT_SHIFT+1);
1606                     block[0]= level;
1607                 }else{
1608                     level= (dc_bias - level)>>(QUANT_SHIFT+1);
1609                     block[0]= -level;
1610                 }
1611 //                last_non_zero = i;
1612             }else{
1613                 block[0]=0;
1614             }
1615         }
1616         last_non_zero= 0;
1617         i=1;
1618     }else{
1619         last_non_zero= -1;
1620         i=0;
1621     }
1622
1623     for(; i<16; i++){
1624         const int j= scantable[i];
1625         int level= block[j]*quant_table[j];
1626
1627 //        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
1628 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
1629         if(((unsigned)(level+threshold1))>threshold2){
1630             if(level>0){
1631                 level= (bias + level)>>QUANT_SHIFT;
1632                 block[j]= level;
1633             }else{
1634                 level= (bias - level)>>QUANT_SHIFT;
1635                 block[j]= -level;
1636             }
1637             last_non_zero = i;
1638         }else{
1639             block[j]=0;
1640         }
1641     }
1642
1643     return last_non_zero;
1644 }
1645
1646 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1647                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1648                            int src_x_offset, int src_y_offset,
1649                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1650     MpegEncContext * const s = &h->s;
1651     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1652     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1653     const int luma_xy= (mx&3) + ((my&3)<<2);
1654     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1655     uint8_t * src_cb, * src_cr;
1656     int extra_width= h->emu_edge_width;
1657     int extra_height= h->emu_edge_height;
1658     int emu=0;
1659     const int full_mx= mx>>2;
1660     const int full_my= my>>2;
1661     const int pic_width  = 16*s->mb_width;
1662     const int pic_height = 16*s->mb_height >> MB_FIELD;
1663
1664     if(!pic->data[0]) //FIXME this is unacceptable, some sensible error concealment must be done for missing reference frames
1665         return;
1666
1667     if(mx&7) extra_width -= 3;
1668     if(my&7) extra_height -= 3;
1669
1670     if(   full_mx < 0-extra_width
1671        || full_my < 0-extra_height
1672        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1673        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1674         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1675             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1676         emu=1;
1677     }
1678
1679     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1680     if(!square){
1681         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1682     }
1683
1684     if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1685
1686     if(MB_FIELD){
1687         // chroma offset when predicting from a field of opposite parity
1688         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1689         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1690     }
1691     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1692     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1693
1694     if(emu){
1695         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1696             src_cb= s->edge_emu_buffer;
1697     }
1698     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1699
1700     if(emu){
1701         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1702             src_cr= s->edge_emu_buffer;
1703     }
1704     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1705 }
1706
1707 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1708                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1709                            int x_offset, int y_offset,
1710                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1711                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1712                            int list0, int list1){
1713     MpegEncContext * const s = &h->s;
1714     qpel_mc_func *qpix_op=  qpix_put;
1715     h264_chroma_mc_func chroma_op= chroma_put;
1716
1717     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1718     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1719     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1720     x_offset += 8*s->mb_x;
1721     y_offset += 8*(s->mb_y >> MB_FIELD);
1722
1723     if(list0){
1724         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1725         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1726                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1727                            qpix_op, chroma_op);
1728
1729         qpix_op=  qpix_avg;
1730         chroma_op= chroma_avg;
1731     }
1732
1733     if(list1){
1734         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1735         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1736                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1737                            qpix_op, chroma_op);
1738     }
1739 }
1740
1741 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1742                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1743                            int x_offset, int y_offset,
1744                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1745                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1746                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1747                            int list0, int list1){
1748     MpegEncContext * const s = &h->s;
1749
1750     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1751     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1752     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1753     x_offset += 8*s->mb_x;
1754     y_offset += 8*(s->mb_y >> MB_FIELD);
1755
1756     if(list0 && list1){
1757         /* don't optimize for luma-only case, since B-frames usually
1758          * use implicit weights => chroma too. */
1759         uint8_t *tmp_cb = s->obmc_scratchpad;
1760         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1761         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1762         int refn0 = h->ref_cache[0][ scan8[n] ];
1763         int refn1 = h->ref_cache[1][ scan8[n] ];
1764
1765         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1766                     dest_y, dest_cb, dest_cr,
1767                     x_offset, y_offset, qpix_put, chroma_put);
1768         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1769                     tmp_y, tmp_cb, tmp_cr,
1770                     x_offset, y_offset, qpix_put, chroma_put);
1771
1772         if(h->use_weight == 2){
1773             int weight0 = h->implicit_weight[refn0][refn1];
1774             int weight1 = 64 - weight0;
1775             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1776             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1777             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1778         }else{
1779             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1780                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1781                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1782             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1783                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1784                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1785             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1786                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1787                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1788         }
1789     }else{
1790         int list = list1 ? 1 : 0;
1791         int refn = h->ref_cache[list][ scan8[n] ];
1792         Picture *ref= &h->ref_list[list][refn];
1793         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1794                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1795                     qpix_put, chroma_put);
1796
1797         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1798                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1799         if(h->use_weight_chroma){
1800             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1801                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1802             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1803                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1804         }
1805     }
1806 }
1807
1808 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1809                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1810                            int x_offset, int y_offset,
1811                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1812                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1813                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1814                            int list0, int list1){
1815     if((h->use_weight==2 && list0 && list1
1816         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1817        || h->use_weight==1)
1818         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1819                          x_offset, y_offset, qpix_put, chroma_put,
1820                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1821     else
1822         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1823                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1824 }
1825
1826 static inline void prefetch_motion(H264Context *h, int list){
1827     /* fetch pixels for estimated mv 4 macroblocks ahead
1828      * optimized for 64byte cache lines */
1829     MpegEncContext * const s = &h->s;
1830     const int refn = h->ref_cache[list][scan8[0]];
1831     if(refn >= 0){
1832         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1833         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1834         uint8_t **src= h->ref_list[list][refn].data;
1835         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1836         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1837         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1838         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1839     }
1840 }
1841
1842 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1843                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1844                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1845                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1846     MpegEncContext * const s = &h->s;
1847     const int mb_xy= h->mb_xy;
1848     const int mb_type= s->current_picture.mb_type[mb_xy];
1849
1850     assert(IS_INTER(mb_type));
1851
1852     prefetch_motion(h, 0);
1853
1854     if(IS_16X16(mb_type)){
1855         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1856                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1857                 &weight_op[0], &weight_avg[0],
1858                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1859     }else if(IS_16X8(mb_type)){
1860         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1861                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1862                 &weight_op[1], &weight_avg[1],
1863                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1864         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1865                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1866                 &weight_op[1], &weight_avg[1],
1867                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1868     }else if(IS_8X16(mb_type)){
1869         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1870                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1871                 &weight_op[2], &weight_avg[2],
1872                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1873         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1874                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1875                 &weight_op[2], &weight_avg[2],
1876                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1877     }else{
1878         int i;
1879
1880         assert(IS_8X8(mb_type));
1881
1882         for(i=0; i<4; i++){
1883             const int sub_mb_type= h->sub_mb_type[i];
1884             const int n= 4*i;
1885             int x_offset= (i&1)<<2;
1886             int y_offset= (i&2)<<1;
1887
1888             if(IS_SUB_8X8(sub_mb_type)){
1889                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1890                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1891                     &weight_op[3], &weight_avg[3],
1892                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1893             }else if(IS_SUB_8X4(sub_mb_type)){
1894                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1895                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1896                     &weight_op[4], &weight_avg[4],
1897                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1898                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1899                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1900                     &weight_op[4], &weight_avg[4],
1901                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1902             }else if(IS_SUB_4X8(sub_mb_type)){
1903                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1904                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1905                     &weight_op[5], &weight_avg[5],
1906                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1907                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1908                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1909                     &weight_op[5], &weight_avg[5],
1910                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1911             }else{
1912                 int j;
1913                 assert(IS_SUB_4X4(sub_mb_type));
1914                 for(j=0; j<4; j++){
1915                     int sub_x_offset= x_offset + 2*(j&1);
1916                     int sub_y_offset= y_offset +   (j&2);
1917                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1918                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1919                         &weight_op[6], &weight_avg[6],
1920                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1921                 }
1922             }
1923         }
1924     }
1925
1926     prefetch_motion(h, 1);
1927 }
1928
1929 static av_cold void decode_init_vlc(void){
1930     static int done = 0;
1931
1932     if (!done) {
1933         int i;
1934         int offset;
1935         done = 1;
1936
1937         chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1938         chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1939         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1940                  &chroma_dc_coeff_token_len [0], 1, 1,
1941                  &chroma_dc_coeff_token_bits[0], 1, 1,
1942                  INIT_VLC_USE_NEW_STATIC);
1943
1944         offset = 0;
1945         for(i=0; i<4; i++){
1946             coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1947             coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1948             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1949                      &coeff_token_len [i][0], 1, 1,
1950                      &coeff_token_bits[i][0], 1, 1,
1951                      INIT_VLC_USE_NEW_STATIC);
1952             offset += coeff_token_vlc_tables_size[i];
1953         }
1954         /*
1955          * This is a one time safety check to make sure that
1956          * the packed static coeff_token_vlc table sizes
1957          * were initialized correctly.
1958          */
1959         assert(offset == sizeof(coeff_token_vlc_tables)/(sizeof(VLC_TYPE)*2));
1960
1961         for(i=0; i<3; i++){
1962             chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1963             chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1964             init_vlc(&chroma_dc_total_zeros_vlc[i],
1965                      CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1966                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1967                      &chroma_dc_total_zeros_bits[i][0], 1, 1,
1968                      INIT_VLC_USE_NEW_STATIC);
1969         }
1970         for(i=0; i<15; i++){
1971             total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1972             total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1973             init_vlc(&total_zeros_vlc[i],
1974                      TOTAL_ZEROS_VLC_BITS, 16,
1975                      &total_zeros_len [i][0], 1, 1,
1976                      &total_zeros_bits[i][0], 1, 1,
1977                      INIT_VLC_USE_NEW_STATIC);
1978         }
1979
1980         for(i=0; i<6; i++){
1981             run_vlc[i].table = run_vlc_tables[i];
1982             run_vlc[i].table_allocated = run_vlc_tables_size;
1983             init_vlc(&run_vlc[i],
1984                      RUN_VLC_BITS, 7,
1985                      &run_len [i][0], 1, 1,
1986                      &run_bits[i][0], 1, 1,
1987                      INIT_VLC_USE_NEW_STATIC);
1988         }
1989         run7_vlc.table = run7_vlc_table,
1990         run7_vlc.table_allocated = run7_vlc_table_size;
1991         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1992                  &run_len [6][0], 1, 1,
1993                  &run_bits[6][0], 1, 1,
1994                  INIT_VLC_USE_NEW_STATIC);
1995     }
1996 }
1997
1998 static void free_tables(H264Context *h){
1999     int i;
2000     H264Context *hx;
2001     av_freep(&h->intra4x4_pred_mode);
2002     av_freep(&h->chroma_pred_mode_table);
2003     av_freep(&h->cbp_table);
2004     av_freep(&h->mvd_table[0]);
2005     av_freep(&h->mvd_table[1]);
2006     av_freep(&h->direct_table);
2007     av_freep(&h->non_zero_count);
2008     av_freep(&h->slice_table_base);
2009     h->slice_table= NULL;
2010
2011     av_freep(&h->mb2b_xy);
2012     av_freep(&h->mb2b8_xy);
2013
2014     for(i = 0; i < MAX_SPS_COUNT; i++)
2015         av_freep(h->sps_buffers + i);
2016
2017     for(i = 0; i < MAX_PPS_COUNT; i++)
2018         av_freep(h->pps_buffers + i);
2019
2020     for(i = 0; i < h->s.avctx->thread_count; i++) {
2021         hx = h->thread_context[i];
2022         if(!hx) continue;
2023         av_freep(&hx->top_borders[1]);
2024         av_freep(&hx->top_borders[0]);
2025         av_freep(&hx->s.obmc_scratchpad);
2026     }
2027 }
2028
2029 static void init_dequant8_coeff_table(H264Context *h){
2030     int i,q,x;
2031     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2032     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2033     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2034
2035     for(i=0; i<2; i++ ){
2036         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2037             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2038             break;
2039         }
2040
2041         for(q=0; q<52; q++){
2042             int shift = ff_div6[q];
2043             int idx = ff_rem6[q];
2044             for(x=0; x<64; x++)
2045                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2046                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2047                     h->pps.scaling_matrix8[i][x]) << shift;
2048         }
2049     }
2050 }
2051
2052 static void init_dequant4_coeff_table(H264Context *h){
2053     int i,j,q,x;
2054     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2055     for(i=0; i<6; i++ ){
2056         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2057         for(j=0; j<i; j++){
2058             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2059                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2060                 break;
2061             }
2062         }
2063         if(j<i)
2064             continue;
2065
2066         for(q=0; q<52; q++){
2067             int shift = ff_div6[q] + 2;
2068             int idx = ff_rem6[q];
2069             for(x=0; x<16; x++)
2070                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2071                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2072                     h->pps.scaling_matrix4[i][x]) << shift;
2073         }
2074     }
2075 }
2076
2077 static void init_dequant_tables(H264Context *h){
2078     int i,x;
2079     init_dequant4_coeff_table(h);
2080     if(h->pps.transform_8x8_mode)
2081         init_dequant8_coeff_table(h);
2082     if(h->sps.transform_bypass){
2083         for(i=0; i<6; i++)
2084             for(x=0; x<16; x++)
2085                 h->dequant4_coeff[i][0][x] = 1<<6;
2086         if(h->pps.transform_8x8_mode)
2087             for(i=0; i<2; i++)
2088                 for(x=0; x<64; x++)
2089                     h->dequant8_coeff[i][0][x] = 1<<6;
2090     }
2091 }
2092
2093
2094 /**
2095  * allocates tables.
2096  * needs width/height
2097  */
2098 static int alloc_tables(H264Context *h){
2099     MpegEncContext * const s = &h->s;
2100     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2101     int x,y;
2102
2103     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2104
2105     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2106     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
2107     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2108
2109     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2110     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2111     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2112     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2113
2114     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(uint8_t));
2115     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2116
2117     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2118     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2119     for(y=0; y<s->mb_height; y++){
2120         for(x=0; x<s->mb_width; x++){
2121             const int mb_xy= x + y*s->mb_stride;
2122             const int b_xy = 4*x + 4*y*h->b_stride;
2123             const int b8_xy= 2*x + 2*y*h->b8_stride;
2124
2125             h->mb2b_xy [mb_xy]= b_xy;
2126             h->mb2b8_xy[mb_xy]= b8_xy;
2127         }
2128     }
2129
2130     s->obmc_scratchpad = NULL;
2131
2132     if(!h->dequant4_coeff[0])
2133         init_dequant_tables(h);
2134
2135     return 0;
2136 fail:
2137     free_tables(h);
2138     return -1;
2139 }
2140
2141 /**
2142  * Mimic alloc_tables(), but for every context thread.
2143  */
2144 static void clone_tables(H264Context *dst, H264Context *src){
2145     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2146     dst->non_zero_count           = src->non_zero_count;
2147     dst->slice_table              = src->slice_table;
2148     dst->cbp_table                = src->cbp_table;
2149     dst->mb2b_xy                  = src->mb2b_xy;
2150     dst->mb2b8_xy                 = src->mb2b8_xy;
2151     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2152     dst->mvd_table[0]             = src->mvd_table[0];
2153     dst->mvd_table[1]             = src->mvd_table[1];
2154     dst->direct_table             = src->direct_table;
2155
2156     dst->s.obmc_scratchpad = NULL;
2157     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2158 }
2159
2160 /**
2161  * Init context
2162  * Allocate buffers which are not shared amongst multiple threads.
2163  */
2164 static int context_init(H264Context *h){
2165     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2166     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2167
2168     return 0;
2169 fail:
2170     return -1; // free_tables will clean up for us
2171 }
2172
2173 static av_cold void common_init(H264Context *h){
2174     MpegEncContext * const s = &h->s;
2175
2176     s->width = s->avctx->width;
2177     s->height = s->avctx->height;
2178     s->codec_id= s->avctx->codec->id;
2179
2180     ff_h264_pred_init(&h->hpc, s->codec_id);
2181
2182     h->dequant_coeff_pps= -1;
2183     s->unrestricted_mv=1;
2184     s->decode=1; //FIXME
2185
2186     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2187     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2188 }
2189
2190 static av_cold int decode_init(AVCodecContext *avctx){
2191     H264Context *h= avctx->priv_data;
2192     MpegEncContext * const s = &h->s;
2193
2194     MPV_decode_defaults(s);
2195
2196     s->avctx = avctx;
2197     common_init(h);
2198
2199     s->out_format = FMT_H264;
2200     s->workaround_bugs= avctx->workaround_bugs;
2201
2202     // set defaults
2203 //    s->decode_mb= ff_h263_decode_mb;
2204     s->quarter_sample = 1;
2205     s->low_delay= 1;
2206
2207     if(avctx->codec_id == CODEC_ID_SVQ3)
2208         avctx->pix_fmt= PIX_FMT_YUVJ420P;
2209     else
2210         avctx->pix_fmt= PIX_FMT_YUV420P;
2211
2212     decode_init_vlc();
2213
2214     if(avctx->extradata_size > 0 && avctx->extradata &&
2215        *(char *)avctx->extradata == 1){
2216         h->is_avc = 1;
2217         h->got_avcC = 0;
2218     } else {
2219         h->is_avc = 0;
2220     }
2221
2222     h->thread_context[0] = h;
2223     h->outputed_poc = INT_MIN;
2224     return 0;
2225 }
2226
2227 static int frame_start(H264Context *h){
2228     MpegEncContext * const s = &h->s;
2229     int i;
2230
2231     if(MPV_frame_start(s, s->avctx) < 0)
2232         return -1;
2233     ff_er_frame_start(s);
2234     /*
2235      * MPV_frame_start uses pict_type to derive key_frame.
2236      * This is incorrect for H.264; IDR markings must be used.
2237      * Zero here; IDR markings per slice in frame or fields are ORed in later.
2238      * See decode_nal_units().
2239      */
2240     s->current_picture_ptr->key_frame= 0;
2241
2242     assert(s->linesize && s->uvlinesize);
2243
2244     for(i=0; i<16; i++){
2245         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2246         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2247     }
2248     for(i=0; i<4; i++){
2249         h->block_offset[16+i]=
2250         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2251         h->block_offset[24+16+i]=
2252         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2253     }
2254
2255     /* can't be in alloc_tables because linesize isn't known there.
2256      * FIXME: redo bipred weight to not require extra buffer? */
2257     for(i = 0; i < s->avctx->thread_count; i++)
2258         if(!h->thread_context[i]->s.obmc_scratchpad)
2259             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2260
2261     /* some macroblocks will be accessed before they're available */
2262     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2263         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
2264
2265 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2266
2267     // We mark the current picture as non-reference after allocating it, so
2268     // that if we break out due to an error it can be released automatically
2269     // in the next MPV_frame_start().
2270     // SVQ3 as well as most other codecs have only last/next/current and thus
2271     // get released even with set reference, besides SVQ3 and others do not
2272     // mark frames as reference later "naturally".
2273     if(s->codec_id != CODEC_ID_SVQ3)
2274         s->current_picture_ptr->reference= 0;
2275
2276     s->current_picture_ptr->field_poc[0]=
2277     s->current_picture_ptr->field_poc[1]= INT_MAX;
2278     assert(s->current_picture_ptr->long_ref==0);
2279
2280     return 0;
2281 }
2282
2283 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2284     MpegEncContext * const s = &h->s;
2285     int i;
2286
2287     src_y  -=   linesize;
2288     src_cb -= uvlinesize;
2289     src_cr -= uvlinesize;
2290
2291     // There are two lines saved, the line above the the top macroblock of a pair,
2292     // and the line above the bottom macroblock
2293     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2294     for(i=1; i<17; i++){
2295         h->left_border[i]= src_y[15+i*  linesize];
2296     }
2297
2298     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2299     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2300
2301     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2302         h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
2303         h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
2304         for(i=1; i<9; i++){
2305             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
2306             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
2307         }
2308         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2309         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2310     }
2311 }
2312
2313 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2314     MpegEncContext * const s = &h->s;
2315     int temp8, i;
2316     uint64_t temp64;
2317     int deblock_left;
2318     int deblock_top;
2319     int mb_xy;
2320
2321     if(h->deblocking_filter == 2) {
2322         mb_xy = h->mb_xy;
2323         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2324         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2325     } else {
2326         deblock_left = (s->mb_x > 0);
2327         deblock_top =  (s->mb_y > 0);
2328     }
2329
2330     src_y  -=   linesize + 1;
2331     src_cb -= uvlinesize + 1;
2332     src_cr -= uvlinesize + 1;
2333
2334 #define XCHG(a,b,t,xchg)\
2335 t= a;\
2336 if(xchg)\
2337     a= b;\
2338 b= t;
2339
2340     if(deblock_left){
2341         for(i = !deblock_top; i<17; i++){
2342             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2343         }
2344     }
2345
2346     if(deblock_top){
2347         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2348         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2349         if(s->mb_x+1 < s->mb_width){
2350             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2351         }
2352     }
2353
2354     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2355         if(deblock_left){
2356             for(i = !deblock_top; i<9; i++){
2357                 XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
2358                 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
2359             }
2360         }
2361         if(deblock_top){
2362             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2363             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2364         }
2365     }
2366 }
2367
2368 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
2369     MpegEncContext * const s = &h->s;
2370     int i;
2371
2372     src_y  -= 2 *   linesize;
2373     src_cb -= 2 * uvlinesize;
2374     src_cr -= 2 * uvlinesize;
2375
2376     // There are two lines saved, the line above the the top macroblock of a pair,
2377     // and the line above the bottom macroblock
2378     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2379     h->left_border[1]= h->top_borders[1][s->mb_x][15];
2380     for(i=2; i<34; i++){
2381         h->left_border[i]= src_y[15+i*  linesize];
2382     }
2383
2384     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
2385     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
2386     *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
2387     *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
2388
2389     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2390         h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
2391         h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
2392         h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
2393         h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
2394         for(i=2; i<18; i++){
2395             h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
2396             h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
2397         }
2398         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
2399         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
2400         *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
2401         *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
2402     }
2403 }
2404
2405 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
2406     MpegEncContext * const s = &h->s;
2407     int temp8, i;
2408     uint64_t temp64;
2409     int deblock_left = (s->mb_x > 0);
2410     int deblock_top  = (s->mb_y > 1);
2411
2412     tprintf(s->avctx, "xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
2413
2414     src_y  -= 2 *   linesize + 1;
2415     src_cb -= 2 * uvlinesize + 1;
2416     src_cr -= 2 * uvlinesize + 1;
2417
2418 #define XCHG(a,b,t,xchg)\
2419 t= a;\
2420 if(xchg)\
2421     a= b;\
2422 b= t;
2423
2424     if(deblock_left){
2425         for(i = (!deblock_top)<<1; i<34; i++){
2426             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2427         }
2428     }
2429
2430     if(deblock_top){
2431         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2432         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2433         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
2434         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
2435         if(s->mb_x+1 < s->mb_width){
2436             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2437             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
2438         }
2439     }
2440
2441     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2442         if(deblock_left){
2443             for(i = (!deblock_top) << 1; i<18; i++){
2444                 XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
2445                 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
2446             }
2447         }
2448         if(deblock_top){
2449             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2450             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2451             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
2452             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
2453         }
2454     }
2455 }
2456
2457 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2458     MpegEncContext * const s = &h->s;
2459     const int mb_x= s->mb_x;
2460     const int mb_y= s->mb_y;
2461     const int mb_xy= h->mb_xy;
2462     const int mb_type= s->current_picture.mb_type[mb_xy];
2463     uint8_t  *dest_y, *dest_cb, *dest_cr;
2464     int linesize, uvlinesize /*dct_offset*/;
2465     int i;
2466     int *block_offset = &h->block_offset[0];
2467     const unsigned int bottom = mb_y & 1;
2468     const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
2469     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2470     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2471
2472     dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2473     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2474     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2475
2476     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2477     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2478
2479     if (!simple && MB_FIELD) {
2480         linesize   = h->mb_linesize   = s->linesize * 2;
2481         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2482         block_offset = &h->block_offset[24];
2483         if(mb_y&1){ //FIXME move out of this function?
2484             dest_y -= s->linesize*15;
2485             dest_cb-= s->uvlinesize*7;
2486             dest_cr-= s->uvlinesize*7;
2487         }
2488         if(FRAME_MBAFF) {
2489             int list;
2490             for(list=0; list<h->list_count; list++){
2491                 if(!USES_LIST(mb_type, list))
2492                     continue;
2493                 if(IS_16X16(mb_type)){
2494                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2495                     fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2496                 }else{
2497                     for(i=0; i<16; i+=4){
2498                         //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
2499                         int ref = h->ref_cache[list][scan8[i]];
2500                         if(ref >= 0)
2501                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2502                     }
2503                 }
2504             }
2505         }
2506     } else {
2507         linesize   = h->mb_linesize   = s->linesize;
2508         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2509 //        dct_offset = s->linesize * 16;
2510     }
2511
2512     if(transform_bypass){
2513         idct_dc_add =
2514         idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2515     }else if(IS_8x8DCT(mb_type)){
2516         idct_dc_add = s->dsp.h264_idct8_dc_add;
2517         idct_add = s->dsp.h264_idct8_add;
2518     }else{
2519         idct_dc_add = s->dsp.h264_idct_dc_add;
2520         idct_add = s->dsp.h264_idct_add;
2521     }
2522
2523     if(!simple && FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
2524        && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
2525         int mbt_y = mb_y&~1;
2526         uint8_t *top_y  = s->current_picture.data[0] + (mbt_y * 16* s->linesize  ) + mb_x * 16;
2527         uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2528         uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2529         xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
2530     }
2531
2532     if (!simple && IS_INTRA_PCM(mb_type)) {
2533         for (i=0; i<16; i++) {
2534             memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
2535         }
2536         for (i=0; i<8; i++) {
2537             memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
2538             memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
2539         }
2540     } else {
2541         if(IS_INTRA(mb_type)){
2542             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
2543                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2544
2545             if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2546                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2547                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2548             }
2549
2550             if(IS_INTRA4x4(mb_type)){
2551                 if(simple || !s->encoding){
2552                     if(IS_8x8DCT(mb_type)){
2553                         for(i=0; i<16; i+=4){
2554                             uint8_t * const ptr= dest_y + block_offset[i];
2555                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2556                             const int nnz = h->non_zero_count_cache[ scan8[i] ];
2557                             h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2558                                                    (h->topright_samples_available<<i)&0x4000, linesize);
2559                             if(nnz){
2560                                 if(nnz == 1 && h->mb[i*16])
2561                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2562                                 else
2563                                     idct_add(ptr, h->mb + i*16, linesize);
2564                             }
2565                         }
2566                     }else
2567                     for(i=0; i<16; i++){
2568                         uint8_t * const ptr= dest_y + block_offset[i];
2569                         uint8_t *topright;
2570                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2571                         int nnz, tr;
2572
2573                         if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2574                             const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2575                             assert(mb_y || linesize <= block_offset[i]);
2576                             if(!topright_avail){
2577                                 tr= ptr[3 - linesize]*0x01010101;
2578                                 topright= (uint8_t*) &tr;
2579                             }else
2580                                 topright= ptr + 4 - linesize;
2581                         }else
2582                             topright= NULL;
2583
2584                         h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2585                         nnz = h->non_zero_count_cache[ scan8[i] ];
2586                         if(nnz){
2587                             if(is_h264){
2588                                 if(nnz == 1 && h->mb[i*16])
2589                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2590                                 else
2591                                     idct_add(ptr, h->mb + i*16, linesize);
2592                             }else
2593                                 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2594                         }
2595                     }
2596                 }
2597             }else{
2598                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2599                 if(is_h264){
2600                     if(!transform_bypass)
2601                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2602                 }else
2603                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2604             }
2605             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
2606                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2607         }else if(is_h264){
2608             hl_motion(h, dest_y, dest_cb, dest_cr,
2609                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2610                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2611                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2612         }
2613
2614
2615         if(!IS_INTRA4x4(mb_type)){
2616             if(is_h264){
2617                 if(IS_INTRA16x16(mb_type)){
2618                     for(i=0; i<16; i++){
2619                         if(h->non_zero_count_cache[ scan8[i] ])
2620                             idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2621                         else if(h->mb[i*16])
2622                             idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2623                     }
2624                 }else{
2625                     const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2626                     for(i=0; i<16; i+=di){
2627                         int nnz = h->non_zero_count_cache[ scan8[i] ];
2628                         if(nnz){
2629                             if(nnz==1 && h->mb[i*16])
2630                                 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2631                             else
2632                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2633                         }
2634                     }
2635                 }
2636             }else{
2637                 for(i=0; i<16; i++){
2638                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2639                         uint8_t * const ptr= dest_y + block_offset[i];
2640                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2641                     }
2642                 }
2643             }
2644         }
2645
2646         if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2647             uint8_t *dest[2] = {dest_cb, dest_cr};
2648             if(transform_bypass){
2649                 idct_add = idct_dc_add = s->dsp.add_pixels4;
2650             }else{
2651                 idct_add = s->dsp.h264_idct_add;
2652                 idct_dc_add = s->dsp.h264_idct_dc_add;
2653                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2654                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2655             }
2656             if(is_h264){
2657                 for(i=16; i<16+8; i++){
2658                     if(h->non_zero_count_cache[ scan8[i] ])
2659                         idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2660                     else if(h->mb[i*16])
2661                         idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2662                 }
2663             }else{
2664                 for(i=16; i<16+8; i++){
2665                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2666                         uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2667                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2668                     }
2669                 }
2670             }
2671         }
2672     }
2673     if(h->deblocking_filter) {
2674         if (!simple && FRAME_MBAFF) {
2675             //FIXME try deblocking one mb at a time?
2676             // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
2677             const int mb_y = s->mb_y - 1;
2678             uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
2679             const int mb_xy= mb_x + mb_y*s->mb_stride;
2680             const int mb_type_top   = s->current_picture.mb_type[mb_xy];
2681             const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
2682             if (!bottom) return;
2683             pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2684             pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2685             pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2686
2687             if(IS_INTRA(mb_type_top | mb_type_bottom))
2688                 xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
2689
2690             backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
2691             // deblock a pair
2692             // top
2693             s->mb_y--; h->mb_xy -= s->mb_stride;
2694             tprintf(h->s.avctx, "call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
2695             fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
2696             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2697             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2698             filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
2699             // bottom
2700             s->mb_y++; h->mb_xy += s->mb_stride;
2701             tprintf(h->s.avctx, "call mbaff filter_mb\n");
2702             fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
2703             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2704             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2705             filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2706         } else {
2707             tprintf(h->s.avctx, "call filter_mb\n");
2708             backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2709             fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2710             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2711             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2712             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2713         }
2714     }
2715 }
2716
2717 /**
2718  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2719  */
2720 static void hl_decode_mb_simple(H264Context *h){
2721     hl_decode_mb_internal(h, 1);
2722 }
2723
2724 /**
2725  * Process a macroblock; this handles edge cases, such as interlacing.
2726  */
2727 static void av_noinline hl_decode_mb_complex(H264Context *h){
2728     hl_decode_mb_internal(h, 0);
2729 }
2730
2731 static void hl_decode_mb(H264Context *h){
2732     MpegEncContext * const s = &h->s;
2733     const int mb_xy= h->mb_xy;
2734     const int mb_type= s->current_picture.mb_type[mb_xy];
2735     int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 ||
2736                     (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || (ENABLE_H264_ENCODER && s->encoding) || ENABLE_SMALL;
2737
2738     if(ENABLE_H264_ENCODER && !s->decode)
2739         return;
2740
2741     if (is_complex)
2742         hl_decode_mb_complex(h);
2743     else hl_decode_mb_simple(h);
2744 }
2745
2746 static void pic_as_field(Picture *pic, const int parity){
2747     int i;
2748     for (i = 0; i < 4; ++i) {
2749         if (parity == PICT_BOTTOM_FIELD)
2750             pic->data[i] += pic->linesize[i];
2751         pic->reference = parity;
2752         pic->linesize[i] *= 2;
2753     }
2754     pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
2755 }
2756
2757 static int split_field_copy(Picture *dest, Picture *src,
2758                             int parity, int id_add){
2759     int match = !!(src->reference & parity);
2760
2761     if (match) {
2762         *dest = *src;
2763         if(parity != PICT_FRAME){
2764             pic_as_field(dest, parity);
2765             dest->pic_id *= 2;
2766             dest->pic_id += id_add;
2767         }
2768     }
2769
2770     return match;
2771 }
2772
2773 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2774     int i[2]={0};
2775     int index=0;
2776
2777     while(i[0]<len || i[1]<len){
2778         while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2779             i[0]++;
2780         while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2781             i[1]++;
2782         if(i[0] < len){
2783             in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2784             split_field_copy(&def[index++], in[ i[0]++ ], sel  , 1);
2785         }
2786         if(i[1] < len){
2787             in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2788             split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2789         }
2790     }
2791
2792     return index;
2793 }
2794
2795 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2796     int i, best_poc;
2797     int out_i= 0;
2798
2799     for(;;){
2800         best_poc= dir ? INT_MIN : INT_MAX;
2801
2802         for(i=0; i<len; i++){
2803             const int poc= src[i]->poc;
2804             if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2805                 best_poc= poc;
2806                 sorted[out_i]= src[i];
2807             }
2808         }
2809         if(best_poc == (dir ? INT_MIN : INT_MAX))
2810             break;
2811         limit= sorted[out_i++]->poc - dir;
2812     }
2813     return out_i;
2814 }
2815
2816 /**
2817  * fills the default_ref_list.
2818  */
2819 static int fill_default_ref_list(H264Context *h){
2820     MpegEncContext * const s = &h->s;
2821     int i, len;
2822
2823     if(h->slice_type_nos==FF_B_TYPE){
2824         Picture *sorted[32];
2825         int cur_poc, list;
2826         int lens[2];
2827
2828         if(FIELD_PICTURE)
2829             cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2830         else
2831             cur_poc= s->current_picture_ptr->poc;
2832
2833         for(list= 0; list<2; list++){
2834             len= add_sorted(sorted    , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2835             len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2836             assert(len<=32);
2837             len= build_def_list(h->default_ref_list[list]    , sorted     , len, 0, s->picture_structure);
2838             len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2839             assert(len<=32);
2840
2841             if(len < h->ref_count[list])
2842                 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2843             lens[list]= len;
2844         }
2845
2846         if(lens[0] == lens[1] && lens[1] > 1){
2847             for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2848             if(i == lens[0])
2849                 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2850         }
2851     }else{
2852         len = build_def_list(h->default_ref_list[0]    , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2853         len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16                , 1, s->picture_structure);
2854         assert(len <= 32);
2855         if(len < h->ref_count[0])
2856             memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2857     }
2858 #ifdef TRACE
2859     for (i=0; i<h->ref_count[0]; i++) {
2860         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2861     }
2862     if(h->slice_type_nos==FF_B_TYPE){
2863         for (i=0; i<h->ref_count[1]; i++) {
2864             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2865         }
2866     }
2867 #endif
2868     return 0;
2869 }
2870
2871 static void print_short_term(H264Context *h);
2872 static void print_long_term(H264Context *h);
2873
2874 /**
2875  * Extract structure information about the picture described by pic_num in
2876  * the current decoding context (frame or field). Note that pic_num is
2877  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2878  * @param pic_num picture number for which to extract structure information
2879  * @param structure one of PICT_XXX describing structure of picture
2880  *                      with pic_num
2881  * @return frame number (short term) or long term index of picture
2882  *         described by pic_num
2883  */
2884 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2885     MpegEncContext * const s = &h->s;
2886
2887     *structure = s->picture_structure;
2888     if(FIELD_PICTURE){
2889         if (!(pic_num & 1))
2890             /* opposite field */
2891             *structure ^= PICT_FRAME;
2892         pic_num >>= 1;
2893     }
2894
2895     return pic_num;
2896 }
2897
2898 static int decode_ref_pic_list_reordering(H264Context *h){
2899     MpegEncContext * const s = &h->s;
2900     int list, index, pic_structure;
2901
2902     print_short_term(h);
2903     print_long_term(h);
2904
2905     for(list=0; list<h->list_count; list++){
2906         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2907
2908         if(get_bits1(&s->gb)){
2909             int pred= h->curr_pic_num;
2910
2911             for(index=0; ; index++){
2912                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
2913                 unsigned int pic_id;
2914                 int i;
2915                 Picture *ref = NULL;
2916
2917                 if(reordering_of_pic_nums_idc==3)
2918                     break;
2919
2920                 if(index >= h->ref_count[list]){
2921                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2922                     return -1;
2923                 }
2924
2925                 if(reordering_of_pic_nums_idc<3){
2926                     if(reordering_of_pic_nums_idc<2){
2927                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2928                         int frame_num;
2929
2930                         if(abs_diff_pic_num > h->max_pic_num){
2931                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2932                             return -1;
2933                         }
2934
2935                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2936                         else                                pred+= abs_diff_pic_num;
2937                         pred &= h->max_pic_num - 1;
2938
2939                         frame_num = pic_num_extract(h, pred, &pic_structure);
2940
2941                         for(i= h->short_ref_count-1; i>=0; i--){
2942                             ref = h->short_ref[i];
2943                             assert(ref->reference);
2944                             assert(!ref->long_ref);
2945                             if(
2946                                    ref->frame_num == frame_num &&
2947                                    (ref->reference & pic_structure)
2948                               )
2949                                 break;
2950                         }
2951                         if(i>=0)
2952                             ref->pic_id= pred;
2953                     }else{
2954                         int long_idx;
2955                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2956
2957                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
2958
2959                         if(long_idx>31){
2960                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2961                             return -1;
2962                         }
2963                         ref = h->long_ref[long_idx];
2964                         assert(!(ref && !ref->reference));
2965                         if(ref && (ref->reference & pic_structure)){
2966                             ref->pic_id= pic_id;
2967                             assert(ref->long_ref);
2968                             i=0;
2969                         }else{
2970                             i=-1;
2971                         }
2972                     }
2973
2974                     if (i < 0) {
2975                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2976                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2977                     } else {
2978                         for(i=index; i+1<h->ref_count[list]; i++){
2979                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2980                                 break;
2981                         }
2982                         for(; i > index; i--){
2983                             h->ref_list[list][i]= h->ref_list[list][i-1];
2984                         }
2985                         h->ref_list[list][index]= *ref;
2986                         if (FIELD_PICTURE){
2987                             pic_as_field(&h->ref_list[list][index], pic_structure);
2988                         }
2989                     }
2990                 }else{
2991                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2992                     return -1;
2993                 }
2994             }
2995         }
2996     }
2997     for(list=0; list<h->list_count; list++){
2998         for(index= 0; index < h->ref_count[list]; index++){
2999             if(!h->ref_list[list][index].data[0]){
3000                 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
3001                 h->ref_list[list][index]= s->current_picture; //FIXME this is not a sensible solution
3002             }
3003         }
3004     }
3005
3006     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
3007         direct_dist_scale_factor(h);
3008     direct_ref_list_init(h);
3009     return 0;
3010 }
3011
3012 static void fill_mbaff_ref_list(H264Context *h){
3013     int list, i, j;
3014     for(list=0; list<2; list++){ //FIXME try list_count
3015         for(i=0; i<h->ref_count[list]; i++){
3016             Picture *frame = &h->ref_list[list][i];
3017             Picture *field = &h->ref_list[list][16+2*i];
3018             field[0] = *frame;
3019             for(j=0; j<3; j++)
3020                 field[0].linesize[j] <<= 1;
3021             field[0].reference = PICT_TOP_FIELD;
3022             field[1] = field[0];
3023             for(j=0; j<3; j++)
3024                 field[1].data[j] += frame->linesize[j];
3025             field[1].reference = PICT_BOTTOM_FIELD;
3026
3027             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
3028             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
3029             for(j=0; j<2; j++){
3030                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
3031                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
3032             }
3033         }
3034     }
3035     for(j=0; j<h->ref_count[1]; j++){
3036         for(i=0; i<h->ref_count[0]; i++)
3037             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
3038         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
3039         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
3040     }
3041 }
3042
3043 static int pred_weight_table(H264Context *h){
3044     MpegEncContext * const s = &h->s;
3045     int list, i;
3046     int luma_def, chroma_def;
3047
3048     h->use_weight= 0;
3049     h->use_weight_chroma= 0;
3050     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3051     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3052     luma_def = 1<<h->luma_log2_weight_denom;
3053     chroma_def = 1<<h->chroma_log2_weight_denom;
3054
3055     for(list=0; list<2; list++){
3056         for(i=0; i<h->ref_count[list]; i++){
3057             int luma_weight_flag, chroma_weight_flag;
3058
3059             luma_weight_flag= get_bits1(&s->gb);
3060             if(luma_weight_flag){
3061                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3062                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3063                 if(   h->luma_weight[list][i] != luma_def
3064                    || h->luma_offset[list][i] != 0)
3065                     h->use_weight= 1;
3066             }else{
3067                 h->luma_weight[list][i]= luma_def;
3068                 h->luma_offset[list][i]= 0;
3069             }
3070
3071             if(CHROMA){
3072                 chroma_weight_flag= get_bits1(&s->gb);
3073                 if(chroma_weight_flag){
3074                     int j;
3075                     for(j=0; j<2; j++){
3076                         h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3077                         h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3078                         if(   h->chroma_weight[list][i][j] != chroma_def
3079                         || h->chroma_offset[list][i][j] != 0)
3080                             h->use_weight_chroma= 1;
3081                     }
3082                 }else{
3083                     int j;
3084                     for(j=0; j<2; j++){
3085                         h->chroma_weight[list][i][j]= chroma_def;
3086                         h->chroma_offset[list][i][j]= 0;
3087                     }
3088                 }
3089             }
3090         }
3091         if(h->slice_type_nos != FF_B_TYPE) break;
3092     }
3093     h->use_weight= h->use_weight || h->use_weight_chroma;
3094     return 0;
3095 }
3096
3097 static void implicit_weight_table(H264Context *h){
3098     MpegEncContext * const s = &h->s;
3099     int ref0, ref1;
3100     int cur_poc = s->current_picture_ptr->poc;
3101
3102     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3103        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3104         h->use_weight= 0;
3105         h->use_weight_chroma= 0;
3106         return;
3107     }
3108
3109     h->use_weight= 2;
3110     h->use_weight_chroma= 2;
3111     h->luma_log2_weight_denom= 5;
3112     h->chroma_log2_weight_denom= 5;
3113
3114     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3115         int poc0 = h->ref_list[0][ref0].poc;
3116         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3117             int poc1 = h->ref_list[1][ref1].poc;
3118             int td = av_clip(poc1 - poc0, -128, 127);
3119             if(td){
3120                 int tb = av_clip(cur_poc - poc0, -128, 127);
3121                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3122                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3123                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3124                     h->implicit_weight[ref0][ref1] = 32;
3125                 else
3126                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3127             }else
3128                 h->implicit_weight[ref0][ref1] = 32;
3129         }
3130     }
3131 }
3132
3133 /**
3134  * Mark a picture as no longer needed for reference. The refmask
3135  * argument allows unreferencing of individual fields or the whole frame.
3136  * If the picture becomes entirely unreferenced, but is being held for
3137  * display purposes, it is marked as such.
3138  * @param refmask mask of fields to unreference; the mask is bitwise
3139  *                anded with the reference marking of pic
3140  * @return non-zero if pic becomes entirely unreferenced (except possibly
3141  *         for display purposes) zero if one of the fields remains in
3142  *         reference
3143  */
3144 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3145     int i;
3146     if (pic->reference &= refmask) {
3147         return 0;
3148     } else {
3149         for(i = 0; h->delayed_pic[i]; i++)
3150             if(pic == h->delayed_pic[i]){
3151                 pic->reference=DELAYED_PIC_REF;
3152                 break;
3153             }
3154         return 1;
3155     }
3156 }
3157
3158 /**
3159  * instantaneous decoder refresh.
3160  */
3161 static void idr(H264Context *h){
3162     int i;
3163
3164     for(i=0; i<16; i++){
3165         remove_long(h, i, 0);
3166     }
3167     assert(h->long_ref_count==0);
3168
3169     for(i=0; i<h->short_ref_count; i++){
3170         unreference_pic(h, h->short_ref[i], 0);
3171         h->short_ref[i]= NULL;
3172     }
3173     h->short_ref_count=0;
3174     h->prev_frame_num= 0;
3175     h->prev_frame_num_offset= 0;
3176     h->prev_poc_msb=
3177     h->prev_poc_lsb= 0;
3178 }
3179
3180 /* forget old pics after a seek */
3181 static void flush_dpb(AVCodecContext *avctx){
3182     H264Context *h= avctx->priv_data;
3183     int i;
3184     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3185         if(h->delayed_pic[i])
3186             h->delayed_pic[i]->reference= 0;
3187         h->delayed_pic[i]= NULL;
3188     }
3189     h->outputed_poc= INT_MIN;
3190     idr(h);
3191     if(h->s.current_picture_ptr)
3192         h->s.current_picture_ptr->reference= 0;
3193     h->s.first_field= 0;
3194     ff_mpeg_flush(avctx);
3195 }
3196
3197 /**
3198  * Find a Picture in the short term reference list by frame number.
3199  * @param frame_num frame number to search for
3200  * @param idx the index into h->short_ref where returned picture is found
3201  *            undefined if no picture found.
3202  * @return pointer to the found picture, or NULL if no pic with the provided
3203  *                 frame number is found
3204  */
3205 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3206     MpegEncContext * const s = &h->s;
3207     int i;
3208
3209     for(i=0; i<h->short_ref_count; i++){
3210         Picture *pic= h->short_ref[i];
3211         if(s->avctx->debug&FF_DEBUG_MMCO)
3212             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3213         if(pic->frame_num == frame_num) {
3214             *idx = i;
3215             return pic;
3216         }
3217     }
3218     return NULL;
3219 }
3220
3221 /**
3222  * Remove a picture from the short term reference list by its index in
3223  * that list.  This does no checking on the provided index; it is assumed
3224  * to be valid. Other list entries are shifted down.
3225  * @param i index into h->short_ref of picture to remove.
3226  */
3227 static void remove_short_at_index(H264Context *h, int i){
3228     assert(i >= 0 && i < h->short_ref_count);
3229     h->short_ref[i]= NULL;
3230     if (--h->short_ref_count)
3231         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3232 }
3233
3234 /**
3235  *
3236  * @return the removed picture or NULL if an error occurs
3237  */
3238 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3239     MpegEncContext * const s = &h->s;
3240     Picture *pic;
3241     int i;
3242
3243     if(s->avctx->debug&FF_DEBUG_MMCO)
3244         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3245
3246     pic = find_short(h, frame_num, &i);
3247     if (pic){
3248         if(unreference_pic(h, pic, ref_mask))
3249         remove_short_at_index(h, i);
3250     }
3251
3252     return pic;
3253 }
3254
3255 /**
3256  * Remove a picture from the long term reference list by its index in
3257  * that list.
3258  * @return the removed picture or NULL if an error occurs
3259  */
3260 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3261     Picture *pic;
3262
3263     pic= h->long_ref[i];
3264     if (pic){
3265         if(unreference_pic(h, pic, ref_mask)){
3266             assert(h->long_ref[i]->long_ref == 1);
3267             h->long_ref[i]->long_ref= 0;
3268             h->long_ref[i]= NULL;
3269             h->long_ref_count--;
3270         }
3271     }
3272
3273     return pic;
3274 }
3275
3276 /**
3277  * print short term list
3278  */
3279 static void print_short_term(H264Context *h) {
3280     uint32_t i;
3281     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3282         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3283         for(i=0; i<h->short_ref_count; i++){
3284             Picture *pic= h->short_ref[i];
3285             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3286         }
3287     }
3288 }
3289
3290 /**
3291  * print long term list
3292  */
3293 static void print_long_term(H264Context *h) {
3294     uint32_t i;
3295     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3296         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3297         for(i = 0; i < 16; i++){
3298             Picture *pic= h->long_ref[i];
3299             if (pic) {
3300                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3301             }
3302         }
3303     }
3304 }
3305
3306 /**
3307  * Executes the reference picture marking (memory management control operations).
3308  */
3309 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3310     MpegEncContext * const s = &h->s;
3311     int i, j;
3312     int current_ref_assigned=0;
3313     Picture *pic;
3314
3315     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3316         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3317
3318     for(i=0; i<mmco_count; i++){
3319         int structure, frame_num;
3320         if(s->avctx->debug&FF_DEBUG_MMCO)
3321             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3322
3323         if(   mmco[i].opcode == MMCO_SHORT2UNUSED
3324            || mmco[i].opcode == MMCO_SHORT2LONG){
3325             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3326             pic = find_short(h, frame_num, &j);
3327             if(!pic){
3328                 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3329                    || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3330                 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3331                 continue;
3332             }
3333         }
3334
3335         switch(mmco[i].opcode){
3336         case MMCO_SHORT2UNUSED:
3337             if(s->avctx->debug&FF_DEBUG_MMCO)
3338                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3339             remove_short(h, frame_num, structure ^ PICT_FRAME);
3340             break;
3341         case MMCO_SHORT2LONG:
3342                 if (h->long_ref[mmco[i].long_arg] != pic)
3343                     remove_long(h, mmco[i].long_arg, 0);
3344
3345                 remove_short_at_index(h, j);
3346                 h->long_ref[ mmco[i].long_arg ]= pic;
3347                 if (h->long_ref[ mmco[i].long_arg ]){
3348                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3349                     h->long_ref_count++;
3350                 }
3351             break;
3352         case MMCO_LONG2UNUSED:
3353             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3354             pic = h->long_ref[j];
3355             if (pic) {
3356                 remove_long(h, j, structure ^ PICT_FRAME);
3357             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3358                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3359             break;
3360         case MMCO_LONG:
3361                     // Comment below left from previous code as it is an interresting note.
3362                     /* First field in pair is in short term list or
3363                      * at a different long term index.
3364                      * This is not allowed; see 7.4.3.3, notes 2 and 3.
3365                      * Report the problem and keep the pair where it is,
3366                      * and mark this field valid.
3367                      */
3368
3369             if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3370                 remove_long(h, mmco[i].long_arg, 0);
3371
3372                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3373                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3374                 h->long_ref_count++;
3375             }
3376
3377             s->current_picture_ptr->reference |= s->picture_structure;
3378             current_ref_assigned=1;
3379             break;
3380         case MMCO_SET_MAX_LONG:
3381             assert(mmco[i].long_arg <= 16);
3382             // just remove the long term which index is greater than new max
3383             for(j = mmco[i].long_arg; j<16; j++){
3384                 remove_long(h, j, 0);
3385             }
3386             break;
3387         case MMCO_RESET:
3388             while(h->short_ref_count){
3389                 remove_short(h, h->short_ref[0]->frame_num, 0);
3390             }
3391             for(j = 0; j < 16; j++) {
3392                 remove_long(h, j, 0);
3393             }
3394             s->current_picture_ptr->poc=
3395             s->current_picture_ptr->field_poc[0]=
3396             s->current_picture_ptr->field_poc[1]=
3397             h->poc_lsb=
3398             h->poc_msb=
3399             h->frame_num=
3400             s->current_picture_ptr->frame_num= 0;
3401             break;
3402         default: assert(0);
3403         }
3404     }
3405
3406     if (!current_ref_assigned) {
3407         /* Second field of complementary field pair; the first field of
3408          * which is already referenced. If short referenced, it
3409          * should be first entry in short_ref. If not, it must exist
3410          * in long_ref; trying to put it on the short list here is an
3411          * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
3412          */
3413         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3414             /* Just mark the second field valid */
3415             s->current_picture_ptr->reference = PICT_FRAME;
3416         } else if (s->current_picture_ptr->long_ref) {
3417             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3418                                              "assignment for second field "
3419                                              "in complementary field pair "
3420                                              "(first field is long term)\n");
3421         } else {
3422             pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3423             if(pic){
3424                 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3425             }
3426
3427             if(h->short_ref_count)
3428                 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3429
3430             h->short_ref[0]= s->current_picture_ptr;
3431             h->short_ref_count++;
3432             s->current_picture_ptr->reference |= s->picture_structure;
3433         }
3434     }
3435
3436     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3437
3438         /* We have too many reference frames, probably due to corrupted
3439          * stream. Need to discard one frame. Prevents overrun of the
3440          * short_ref and long_ref buffers.
3441          */
3442         av_log(h->s.avctx, AV_LOG_ERROR,
3443                "number of reference frames exceeds max (probably "
3444                "corrupt input), discarding one\n");
3445
3446         if (h->long_ref_count && !h->short_ref_count) {
3447             for (i = 0; i < 16; ++i)
3448                 if (h->long_ref[i])
3449                     break;
3450
3451             assert(i < 16);
3452             remove_long(h, i, 0);
3453         } else {
3454             pic = h->short_ref[h->short_ref_count - 1];
3455             remove_short(h, pic->frame_num, 0);
3456         }
3457     }
3458
3459     print_short_term(h);
3460     print_long_term(h);
3461     return 0;
3462 }
3463
3464 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3465     MpegEncContext * const s = &h->s;
3466     int i;
3467
3468     h->mmco_index= 0;
3469     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3470         s->broken_link= get_bits1(gb) -1;
3471         if(get_bits1(gb)){
3472             h->mmco[0].opcode= MMCO_LONG;
3473             h->mmco[0].long_arg= 0;
3474             h->mmco_index= 1;
3475         }
3476     }else{
3477         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3478             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3479                 MMCOOpcode opcode= get_ue_golomb(gb);
3480
3481                 h->mmco[i].opcode= opcode;
3482                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3483                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3484 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3485                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3486                         return -1;
3487                     }*/
3488                 }
3489                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3490                     unsigned int long_arg= get_ue_golomb(gb);
3491                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3492                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3493                         return -1;
3494                     }
3495                     h->mmco[i].long_arg= long_arg;
3496                 }
3497
3498                 if(opcode > (unsigned)MMCO_LONG){
3499                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3500                     return -1;
3501                 }
3502                 if(opcode == MMCO_END)
3503                     break;
3504             }
3505             h->mmco_index= i;
3506         }else{
3507             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3508
3509             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3510                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3511                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3512                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3513                 h->mmco_index= 1;
3514                 if (FIELD_PICTURE) {
3515                     h->mmco[0].short_pic_num *= 2;
3516                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3517                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3518                     h->mmco_index= 2;
3519                 }
3520             }
3521         }
3522     }
3523
3524     return 0;
3525 }
3526
3527 static int init_poc(H264Context *h){
3528     MpegEncContext * const s = &h->s;
3529     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3530     int field_poc[2];
3531     Picture *cur = s->current_picture_ptr;
3532
3533     h->frame_num_offset= h->prev_frame_num_offset;
3534     if(h->frame_num < h->prev_frame_num)
3535         h->frame_num_offset += max_frame_num;
3536
3537     if(h->sps.poc_type==0){
3538         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3539
3540         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3541             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3542         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3543             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3544         else
3545             h->poc_msb = h->prev_poc_msb;
3546 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3547         field_poc[0] =
3548         field_poc[1] = h->poc_msb + h->poc_lsb;
3549         if(s->picture_structure == PICT_FRAME)
3550             field_poc[1] += h->delta_poc_bottom;
3551     }else if(h->sps.poc_type==1){
3552         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3553         int i;
3554
3555         if(h->sps.poc_cycle_length != 0)
3556             abs_frame_num = h->frame_num_offset + h->frame_num;
3557         else
3558             abs_frame_num = 0;
3559
3560         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3561             abs_frame_num--;
3562
3563         expected_delta_per_poc_cycle = 0;
3564         for(i=0; i < h->sps.poc_cycle_length; i++)
3565             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3566
3567         if(abs_frame_num > 0){
3568             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3569             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3570
3571             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3572             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3573                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3574         } else
3575             expectedpoc = 0;
3576
3577         if(h->nal_ref_idc == 0)
3578             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3579
3580         field_poc[0] = expectedpoc + h->delta_poc[0];
3581         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3582
3583         if(s->picture_structure == PICT_FRAME)
3584             field_poc[1] += h->delta_poc[1];
3585     }else{
3586         int poc= 2*(h->frame_num_offset + h->frame_num);
3587
3588         if(!h->nal_ref_idc)
3589             poc--;
3590
3591         field_poc[0]= poc;
3592         field_poc[1]= poc;
3593     }
3594
3595     if(s->picture_structure != PICT_BOTTOM_FIELD)
3596         s->current_picture_ptr->field_poc[0]= field_poc[0];
3597     if(s->picture_structure != PICT_TOP_FIELD)
3598         s->current_picture_ptr->field_poc[1]= field_poc[1];
3599     cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3600
3601     return 0;
3602 }
3603
3604
3605 /**
3606  * initialize scan tables
3607  */
3608 static void init_scan_tables(H264Context *h){
3609     MpegEncContext * const s = &h->s;
3610     int i;
3611     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3612         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3613         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3614     }else{
3615         for(i=0; i<16; i++){
3616 #define T(x) (x>>2) | ((x<<2) & 0xF)
3617             h->zigzag_scan[i] = T(zigzag_scan[i]);
3618             h-> field_scan[i] = T( field_scan[i]);
3619 #undef T
3620         }
3621     }
3622     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3623         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
3624         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3625         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3626         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3627     }else{
3628         for(i=0; i<64; i++){
3629 #define T(x) (x>>3) | ((x&7)<<3)
3630             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
3631             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3632             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3633             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3634 #undef T
3635         }
3636     }
3637     if(h->sps.transform_bypass){ //FIXME same ugly
3638         h->zigzag_scan_q0          = zigzag_scan;
3639         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
3640         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3641         h->field_scan_q0           = field_scan;
3642         h->field_scan8x8_q0        = field_scan8x8;
3643         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3644     }else{
3645         h->zigzag_scan_q0          = h->zigzag_scan;
3646         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3647         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3648         h->field_scan_q0           = h->field_scan;
3649         h->field_scan8x8_q0        = h->field_scan8x8;
3650         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3651     }
3652 }
3653
3654 /**
3655  * Replicates H264 "master" context to thread contexts.
3656  */
3657 static void clone_slice(H264Context *dst, H264Context *src)
3658 {
3659     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3660     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3661     dst->s.current_picture      = src->s.current_picture;
3662     dst->s.linesize             = src->s.linesize;
3663     dst->s.uvlinesize           = src->s.uvlinesize;
3664     dst->s.first_field          = src->s.first_field;
3665
3666     dst->prev_poc_msb           = src->prev_poc_msb;
3667     dst->prev_poc_lsb           = src->prev_poc_lsb;
3668     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3669     dst->prev_frame_num         = src->prev_frame_num;
3670     dst->short_ref_count        = src->short_ref_count;
3671
3672     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3673     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3674     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3675     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3676
3677     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3678     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3679 }
3680
3681 /**
3682  * decodes a slice header.
3683  * This will also call MPV_common_init() and frame_start() as needed.
3684  *
3685  * @param h h264context
3686  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3687  *
3688  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3689  */
3690 static int decode_slice_header(H264Context *h, H264Context *h0){
3691     MpegEncContext * const s = &h->s;
3692     MpegEncContext * const s0 = &h0->s;
3693     unsigned int first_mb_in_slice;
3694     unsigned int pps_id;
3695     int num_ref_idx_active_override_flag;
3696     static const uint8_t slice_type_map[5]= {FF_P_TYPE, FF_B_TYPE, FF_I_TYPE, FF_SP_TYPE, FF_SI_TYPE};
3697     unsigned int slice_type, tmp, i, j;
3698     int default_ref_list_done = 0;
3699     int last_pic_structure;
3700
3701     s->dropable= h->nal_ref_idc == 0;
3702
3703     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3704         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3705         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3706     }else{
3707         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3708         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3709     }
3710
3711     first_mb_in_slice= get_ue_golomb(&s->gb);
3712
3713     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3714         h0->current_slice = 0;
3715         if (!s0->first_field)
3716             s->current_picture_ptr= NULL;
3717     }
3718
3719     slice_type= get_ue_golomb(&s->gb);
3720     if(slice_type > 9){
3721         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3722         return -1;
3723     }
3724     if(slice_type > 4){
3725         slice_type -= 5;
3726         h->slice_type_fixed=1;
3727     }else
3728         h->slice_type_fixed=0;
3729
3730     slice_type= slice_type_map[ slice_type ];
3731     if (slice_type == FF_I_TYPE
3732         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3733         default_ref_list_done = 1;
3734     }
3735     h->slice_type= slice_type;
3736     h->slice_type_nos= slice_type & 3;
3737
3738     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3739     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3740         av_log(h->s.avctx, AV_LOG_ERROR,
3741                "B picture before any references, skipping\n");
3742         return -1;
3743     }
3744
3745     pps_id= get_ue_golomb(&s->gb);
3746     if(pps_id>=MAX_PPS_COUNT){
3747         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3748         return -1;
3749     }
3750     if(!h0->pps_buffers[pps_id]) {
3751         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS referenced\n");
3752         return -1;
3753     }
3754     h->pps= *h0->pps_buffers[pps_id];
3755
3756     if(!h0->sps_buffers[h->pps.sps_id]) {
3757         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS referenced\n");
3758         return -1;
3759     }
3760     h->sps = *h0->sps_buffers[h->pps.sps_id];
3761
3762     if(h == h0 && h->dequant_coeff_pps != pps_id){
3763         h->dequant_coeff_pps = pps_id;
3764         init_dequant_tables(h);
3765     }
3766
3767     s->mb_width= h->sps.mb_width;
3768     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3769
3770     h->b_stride=  s->mb_width*4;
3771     h->b8_stride= s->mb_width*2;
3772
3773     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3774     if(h->sps.frame_mbs_only_flag)
3775         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3776     else
3777         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3778
3779     if (s->context_initialized
3780         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3781         if(h != h0)
3782             return -1;   // width / height changed during parallelized decoding
3783         free_tables(h);
3784         MPV_common_end(s);
3785     }
3786     if (!s->context_initialized) {
3787         if(h != h0)
3788             return -1;  // we cant (re-)initialize context during parallel decoding
3789         if (MPV_common_init(s) < 0)
3790             return -1;
3791         s->first_field = 0;
3792
3793         init_scan_tables(h);
3794         alloc_tables(h);
3795
3796         for(i = 1; i < s->avctx->thread_count; i++) {
3797             H264Context *c;
3798             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3799             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3800             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3801             c->sps = h->sps;
3802             c->pps = h->pps;
3803             init_scan_tables(c);
3804             clone_tables(c, h);
3805         }
3806
3807         for(i = 0; i < s->avctx->thread_count; i++)
3808             if(context_init(h->thread_context[i]) < 0)
3809                 return -1;
3810
3811         s->avctx->width = s->width;
3812         s->avctx->height = s->height;
3813         s->avctx->sample_aspect_ratio= h->sps.sar;
3814         if(!s->avctx->sample_aspect_ratio.den)
3815             s->avctx->sample_aspect_ratio.den = 1;
3816
3817         if(h->sps.timing_info_present_flag){
3818             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3819             if(h->x264_build > 0 && h->x264_build < 44)
3820                 s->avctx->time_base.den *= 2;
3821             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3822                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3823         }
3824     }
3825
3826     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3827
3828     h->mb_mbaff = 0;
3829     h->mb_aff_frame = 0;
3830     last_pic_structure = s0->picture_structure;
3831     if(h->sps.frame_mbs_only_flag){
3832         s->picture_structure= PICT_FRAME;
3833     }else{
3834         if(get_bits1(&s->gb)) { //field_pic_flag
3835             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3836         } else {
3837             s->picture_structure= PICT_FRAME;
3838             h->mb_aff_frame = h->sps.mb_aff;
3839         }
3840     }
3841     h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3842
3843     if(h0->current_slice == 0){
3844         while(h->frame_num !=  h->prev_frame_num &&
3845               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3846             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3847             frame_start(h);
3848             h->prev_frame_num++;
3849             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3850             s->current_picture_ptr->frame_num= h->prev_frame_num;
3851             execute_ref_pic_marking(h, NULL, 0);
3852         }
3853
3854         /* See if we have a decoded first field looking for a pair... */
3855         if (s0->first_field) {
3856             assert(s0->current_picture_ptr);
3857             assert(s0->current_picture_ptr->data[0]);
3858             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3859
3860             /* figure out if we have a complementary field pair */
3861             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3862                 /*
3863                  * Previous field is unmatched. Don't display it, but let it
3864                  * remain for reference if marked as such.
3865                  */
3866                 s0->current_picture_ptr = NULL;
3867                 s0->first_field = FIELD_PICTURE;
3868
3869             } else {
3870                 if (h->nal_ref_idc &&
3871                         s0->current_picture_ptr->reference &&
3872                         s0->current_picture_ptr->frame_num != h->frame_num) {
3873                     /*
3874                      * This and previous field were reference, but had
3875                      * different frame_nums. Consider this field first in
3876                      * pair. Throw away previous field except for reference
3877                      * purposes.
3878                      */
3879                     s0->first_field = 1;
3880                     s0->current_picture_ptr = NULL;
3881
3882                 } else {
3883                     /* Second field in complementary pair */
3884                     s0->first_field = 0;
3885                 }
3886             }
3887
3888         } else {
3889             /* Frame or first field in a potentially complementary pair */
3890             assert(!s0->current_picture_ptr);
3891             s0->first_field = FIELD_PICTURE;
3892         }
3893
3894         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3895             s0->first_field = 0;
3896             return -1;
3897         }
3898     }
3899     if(h != h0)
3900         clone_slice(h, h0);
3901
3902     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3903
3904     assert(s->mb_num == s->mb_width * s->mb_height);
3905     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3906        first_mb_in_slice                    >= s->mb_num){
3907         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3908         return -1;
3909     }
3910     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3911     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3912     if (s->picture_structure == PICT_BOTTOM_FIELD)
3913         s->resync_mb_y = s->mb_y = s->mb_y + 1;
3914     assert(s->mb_y < s->mb_height);
3915
3916     if(s->picture_structure==PICT_FRAME){
3917         h->curr_pic_num=   h->frame_num;
3918         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3919     }else{
3920         h->curr_pic_num= 2*h->frame_num + 1;
3921         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3922     }
3923
3924     if(h->nal_unit_type == NAL_IDR_SLICE){
3925         get_ue_golomb(&s->gb); /* idr_pic_id */
3926     }
3927
3928     if(h->sps.poc_type==0){
3929         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3930
3931         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3932             h->delta_poc_bottom= get_se_golomb(&s->gb);
3933         }
3934     }
3935
3936     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3937         h->delta_poc[0]= get_se_golomb(&s->gb);
3938
3939         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3940             h->delta_poc[1]= get_se_golomb(&s->gb);
3941     }
3942
3943     init_poc(h);
3944
3945     if(h->pps.redundant_pic_cnt_present){
3946         h->redundant_pic_count= get_ue_golomb(&s->gb);
3947     }
3948
3949     //set defaults, might be overridden a few lines later
3950     h->ref_count[0]= h->pps.ref_count[0];
3951     h->ref_count[1]= h->pps.ref_count[1];
3952
3953     if(h->slice_type_nos != FF_I_TYPE){
3954         if(h->slice_type_nos == FF_B_TYPE){
3955             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3956         }
3957         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3958
3959         if(num_ref_idx_active_override_flag){
3960             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3961             if(h->slice_type_nos==FF_B_TYPE)
3962                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3963
3964             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3965                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3966                 h->ref_count[0]= h->ref_count[1]= 1;
3967                 return -1;
3968             }
3969         }
3970         if(h->slice_type_nos == FF_B_TYPE)
3971             h->list_count= 2;
3972         else
3973             h->list_count= 1;
3974     }else
3975         h->list_count= 0;
3976
3977     if(!default_ref_list_done){
3978         fill_default_ref_list(h);
3979     }
3980
3981     if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
3982         return -1;
3983
3984     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
3985        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
3986         pred_weight_table(h);
3987     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
3988         implicit_weight_table(h);
3989     else
3990         h->use_weight = 0;
3991
3992     if(h->nal_ref_idc)
3993         decode_ref_pic_marking(h0, &s->gb);
3994
3995     if(FRAME_MBAFF)
3996         fill_mbaff_ref_list(h);
3997
3998     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
3999         tmp = get_ue_golomb(&s->gb);
4000         if(tmp > 2){
4001             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
4002             return -1;
4003         }
4004         h->cabac_init_idc= tmp;
4005     }
4006
4007     h->last_qscale_diff = 0;
4008     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
4009     if(tmp>51){
4010         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
4011         return -1;
4012     }
4013     s->qscale= tmp;
4014     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
4015     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
4016     //FIXME qscale / qp ... stuff
4017     if(h->slice_type == FF_SP_TYPE){
4018         get_bits1(&s->gb); /* sp_for_switch_flag */
4019     }
4020     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
4021         get_se_golomb(&s->gb); /* slice_qs_delta */
4022     }
4023
4024     h->deblocking_filter = 1;
4025     h->slice_alpha_c0_offset = 0;
4026     h->slice_beta_offset = 0;
4027     if( h->pps.deblocking_filter_parameters_present ) {
4028         tmp= get_ue_golomb(&s->gb);
4029         if(tmp > 2){
4030             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4031             return -1;
4032         }
4033         h->deblocking_filter= tmp;
4034         if(h->deblocking_filter < 2)
4035             h->deblocking_filter^= 1; // 1<->0
4036
4037         if( h->deblocking_filter ) {
4038             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4039             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4040         }
4041     }
4042
4043     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4044        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
4045        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
4046        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4047         h->deblocking_filter= 0;
4048
4049     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4050         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4051             /* Cheat slightly for speed:
4052                Do not bother to deblock across slices. */
4053             h->deblocking_filter = 2;
4054         } else {
4055             h0->max_contexts = 1;
4056             if(!h0->single_decode_warning) {
4057                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4058                 h0->single_decode_warning = 1;
4059             }
4060             if(h != h0)
4061                 return 1; // deblocking switched inside frame
4062         }
4063     }
4064
4065 #if 0 //FMO
4066     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4067         slice_group_change_cycle= get_bits(&s->gb, ?);
4068 #endif
4069
4070     h0->last_slice_type = slice_type;
4071     h->slice_num = ++h0->current_slice;
4072
4073     for(j=0; j<2; j++){
4074         int *ref2frm= h->ref2frm[h->slice_num&15][j];
4075         ref2frm[0]=
4076         ref2frm[1]= -1;
4077         for(i=0; i<48; i++)
4078             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
4079                           +(h->ref_list[j][i].reference&3);
4080     }
4081
4082     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4083     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4084
4085     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4086         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4087                h->slice_num,
4088                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4089                first_mb_in_slice,
4090                av_get_pict_type_char(h->slice_type),
4091                pps_id, h->frame_num,
4092                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4093                h->ref_count[0], h->ref_count[1],
4094                s->qscale,
4095                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4096                h->use_weight,
4097                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4098                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4099                );
4100     }
4101
4102     return 0;
4103 }
4104
4105 /**
4106  *
4107  */
4108 static inline int get_level_prefix(GetBitContext *gb){
4109     unsigned int buf;
4110     int log;
4111
4112     OPEN_READER(re, gb);
4113     UPDATE_CACHE(re, gb);
4114     buf=GET_CACHE(re, gb);
4115
4116     log= 32 - av_log2(buf);
4117 #ifdef TRACE
4118     print_bin(buf>>(32-log), log);
4119     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4120 #endif
4121
4122     LAST_SKIP_BITS(re, gb, log);
4123     CLOSE_READER(re, gb);
4124
4125     return log-1;
4126 }
4127
4128 static inline int get_dct8x8_allowed(H264Context *h){
4129     int i;
4130     for(i=0; i<4; i++){
4131         if(!IS_SUB_8X8(h->sub_mb_type[i])
4132            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4133             return 0;
4134     }
4135     return 1;
4136 }
4137
4138 /**
4139  * decodes a residual block.
4140  * @param n block index
4141  * @param scantable scantable
4142  * @param max_coeff number of coefficients in the block
4143  * @return <0 if an error occurred
4144  */
4145 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4146     MpegEncContext * const s = &h->s;
4147     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4148     int level[16];
4149     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4150
4151     //FIXME put trailing_onex into the context
4152
4153     if(n == CHROMA_DC_BLOCK_INDEX){
4154         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4155         total_coeff= coeff_token>>2;
4156     }else{
4157         if(n == LUMA_DC_BLOCK_INDEX){
4158             total_coeff= pred_non_zero_count(h, 0);
4159             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4160             total_coeff= coeff_token>>2;
4161         }else{
4162             total_coeff= pred_non_zero_count(h, n);
4163             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4164             total_coeff= coeff_token>>2;
4165             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4166         }
4167     }
4168
4169     //FIXME set last_non_zero?
4170
4171     if(total_coeff==0)
4172         return 0;
4173     if(total_coeff > (unsigned)max_coeff) {
4174         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4175         return -1;
4176     }
4177
4178     trailing_ones= coeff_token&3;
4179     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4180     assert(total_coeff<=16);
4181
4182     for(i=0; i<trailing_ones; i++){
4183         level[i]= 1 - 2*get_bits1(gb);
4184     }
4185
4186     if(i<total_coeff) {
4187         int level_code, mask;
4188         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4189         int prefix= get_level_prefix(gb);
4190
4191         //first coefficient has suffix_length equal to 0 or 1
4192         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4193             if(suffix_length)
4194                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4195             else
4196                 level_code= (prefix<<suffix_length); //part
4197         }else if(prefix==14){
4198             if(suffix_length)
4199                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4200             else
4201                 level_code= prefix + get_bits(gb, 4); //part
4202         }else{
4203             level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
4204             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4205             if(prefix>=16)
4206                 level_code += (1<<(prefix-3))-4096;
4207         }
4208
4209         if(trailing_ones < 3) level_code += 2;
4210
4211         suffix_length = 1;
4212         if(level_code > 5)
4213             suffix_length++;
4214         mask= -(level_code&1);
4215         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4216         i++;
4217
4218         //remaining coefficients have suffix_length > 0
4219         for(;i<total_coeff;i++) {
4220             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4221             prefix = get_level_prefix(gb);
4222             if(prefix<15){
4223                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4224             }else{
4225                 level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4226                 if(prefix>=16)
4227                     level_code += (1<<(prefix-3))-4096;
4228             }
4229             mask= -(level_code&1);
4230             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4231             if(level_code > suffix_limit[suffix_length])
4232                 suffix_length++;
4233         }
4234     }
4235
4236     if(total_coeff == max_coeff)
4237         zeros_left=0;
4238     else{
4239         if(n == CHROMA_DC_BLOCK_INDEX)
4240             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4241         else
4242             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4243     }
4244
4245     coeff_num = zeros_left + total_coeff - 1;
4246     j = scantable[coeff_num];
4247     if(n > 24){
4248         block[j] = level[0];
4249         for(i=1;i<total_coeff;i++) {
4250             if(zeros_left <= 0)
4251                 run_before = 0;
4252             else if(zeros_left < 7){
4253                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4254             }else{
4255                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4256             }
4257             zeros_left -= run_before;
4258             coeff_num -= 1 + run_before;
4259             j= scantable[ coeff_num ];
4260
4261             block[j]= level[i];
4262         }
4263     }else{
4264         block[j] = (level[0] * qmul[j] + 32)>>6;
4265         for(i=1;i<total_coeff;i++) {
4266             if(zeros_left <= 0)
4267                 run_before = 0;
4268             else if(zeros_left < 7){
4269                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4270             }else{
4271                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4272             }
4273             zeros_left -= run_before;
4274             coeff_num -= 1 + run_before;
4275             j= scantable[ coeff_num ];
4276
4277             block[j]= (level[i] * qmul[j] + 32)>>6;
4278         }
4279     }
4280
4281     if(zeros_left<0){
4282         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4283         return -1;
4284     }
4285
4286     return 0;
4287 }
4288
4289 static void predict_field_decoding_flag(H264Context *h){
4290     MpegEncContext * const s = &h->s;
4291     const int mb_xy= h->mb_xy;
4292     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4293                 ? s->current_picture.mb_type[mb_xy-1]
4294                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4295                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4296                 : 0;
4297     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4298 }
4299
4300 /**
4301  * decodes a P_SKIP or B_SKIP macroblock
4302  */
4303 static void decode_mb_skip(H264Context *h){
4304     MpegEncContext * const s = &h->s;
4305     const int mb_xy= h->mb_xy;
4306     int mb_type=0;
4307
4308     memset(h->non_zero_count[mb_xy], 0, 16);
4309     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4310
4311     if(MB_FIELD)
4312         mb_type|= MB_TYPE_INTERLACED;
4313
4314     if( h->slice_type_nos == FF_B_TYPE )
4315     {
4316         // just for fill_caches. pred_direct_motion will set the real mb_type
4317         mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4318
4319         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4320         pred_direct_motion(h, &mb_type);
4321         mb_type|= MB_TYPE_SKIP;
4322     }
4323     else
4324     {
4325         int mx, my;
4326         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4327
4328         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4329         pred_pskip_motion(h, &mx, &my);
4330         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4331         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4332     }
4333
4334     write_back_motion(h, mb_type);
4335     s->current_picture.mb_type[mb_xy]= mb_type;
4336     s->current_picture.qscale_table[mb_xy]= s->qscale;
4337     h->slice_table[ mb_xy ]= h->slice_num;
4338     h->prev_mb_skipped= 1;
4339 }
4340
4341 /**
4342  * decodes a macroblock
4343  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4344  */
4345 static int decode_mb_cavlc(H264Context *h){
4346     MpegEncContext * const s = &h->s;
4347     int mb_xy;
4348     int partition_count;
4349     unsigned int mb_type, cbp;
4350     int dct8x8_allowed= h->pps.transform_8x8_mode;
4351
4352     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4353
4354     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
4355
4356     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4357     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4358                 down the code */
4359     if(h->slice_type_nos != FF_I_TYPE){
4360         if(s->mb_skip_run==-1)
4361             s->mb_skip_run= get_ue_golomb(&s->gb);
4362
4363         if (s->mb_skip_run--) {
4364             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4365                 if(s->mb_skip_run==0)
4366                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4367                 else
4368                     predict_field_decoding_flag(h);
4369             }
4370             decode_mb_skip(h);
4371             return 0;
4372         }
4373     }
4374     if(FRAME_MBAFF){
4375         if( (s->mb_y&1) == 0 )
4376             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4377     }
4378
4379     h->prev_mb_skipped= 0;
4380
4381     mb_type= get_ue_golomb(&s->gb);
4382     if(h->slice_type_nos == FF_B_TYPE){
4383         if(mb_type < 23){
4384             partition_count= b_mb_type_info[mb_type].partition_count;
4385             mb_type=         b_mb_type_info[mb_type].type;
4386         }else{
4387             mb_type -= 23;
4388             goto decode_intra_mb;
4389         }
4390     }else if(h->slice_type_nos == FF_P_TYPE){
4391         if(mb_type < 5){
4392             partition_count= p_mb_type_info[mb_type].partition_count;
4393             mb_type=         p_mb_type_info[mb_type].type;
4394         }else{
4395             mb_type -= 5;
4396             goto decode_intra_mb;
4397         }
4398     }else{
4399        assert(h->slice_type_nos == FF_I_TYPE);
4400         if(h->slice_type == FF_SI_TYPE && mb_type)
4401             mb_type--;
4402 decode_intra_mb:
4403         if(mb_type > 25){
4404             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4405             return -1;
4406         }
4407         partition_count=0;
4408         cbp= i_mb_type_info[mb_type].cbp;
4409         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4410         mb_type= i_mb_type_info[mb_type].type;
4411     }
4412
4413     if(MB_FIELD)
4414         mb_type |= MB_TYPE_INTERLACED;
4415
4416     h->slice_table[ mb_xy ]= h->slice_num;
4417
4418     if(IS_INTRA_PCM(mb_type)){
4419         unsigned int x;
4420
4421         // We assume these blocks are very rare so we do not optimize it.
4422         align_get_bits(&s->gb);
4423
4424         // The pixels are stored in the same order as levels in h->mb array.
4425         for(x=0; x < (CHROMA ? 384 : 256); x++){
4426             ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4427         }
4428
4429         // In deblocking, the quantizer is 0
4430         s->current_picture.qscale_table[mb_xy]= 0;
4431         // All coeffs are present
4432         memset(h->non_zero_count[mb_xy], 16, 16);
4433
4434         s->current_picture.mb_type[mb_xy]= mb_type;
4435         return 0;
4436     }
4437
4438     if(MB_MBAFF){
4439         h->ref_count[0] <<= 1;
4440         h->ref_count[1] <<= 1;
4441     }
4442
4443     fill_caches(h, mb_type, 0);
4444
4445     //mb_pred
4446     if(IS_INTRA(mb_type)){
4447         int pred_mode;
4448 //            init_top_left_availability(h);
4449         if(IS_INTRA4x4(mb_type)){
4450             int i;
4451             int di = 1;
4452             if(dct8x8_allowed && get_bits1(&s->gb)){
4453                 mb_type |= MB_TYPE_8x8DCT;
4454                 di = 4;
4455             }
4456
4457 //                fill_intra4x4_pred_table(h);
4458             for(i=0; i<16; i+=di){
4459                 int mode= pred_intra_mode(h, i);
4460
4461                 if(!get_bits1(&s->gb)){
4462                     const int rem_mode= get_bits(&s->gb, 3);
4463                     mode = rem_mode + (rem_mode >= mode);
4464                 }
4465
4466                 if(di==4)
4467                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4468                 else
4469                     h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4470             }
4471             write_back_intra_pred_mode(h);
4472             if( check_intra4x4_pred_mode(h) < 0)
4473                 return -1;
4474         }else{
4475             h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4476             if(h->intra16x16_pred_mode < 0)
4477                 return -1;
4478         }
4479         if(CHROMA){
4480             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4481             if(pred_mode < 0)
4482                 return -1;
4483             h->chroma_pred_mode= pred_mode;
4484         }
4485     }else if(partition_count==4){
4486         int i, j, sub_partition_count[4], list, ref[2][4];
4487
4488         if(h->slice_type_nos == FF_B_TYPE){
4489             for(i=0; i<4; i++){
4490                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4491                 if(h->sub_mb_type[i] >=13){
4492                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4493                     return -1;
4494                 }
4495                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4496                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4497             }
4498             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4499                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4500                 pred_direct_motion(h, &mb_type);
4501                 h->ref_cache[0][scan8[4]] =
4502                 h->ref_cache[1][scan8[4]] =
4503                 h->ref_cache[0][scan8[12]] =
4504                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4505             }
4506         }else{
4507             assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4508             for(i=0; i<4; i++){
4509                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4510                 if(h->sub_mb_type[i] >=4){
4511                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4512                     return -1;
4513                 }
4514                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4515                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4516             }
4517         }
4518
4519         for(list=0; list<h->list_count; list++){
4520             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4521             for(i=0; i<4; i++){
4522                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4523                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4524                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4525                     if(tmp>=ref_count){
4526                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4527                         return -1;
4528                     }
4529                     ref[list][i]= tmp;
4530                 }else{
4531                  //FIXME
4532                     ref[list][i] = -1;
4533                 }
4534             }
4535         }
4536
4537         if(dct8x8_allowed)
4538             dct8x8_allowed = get_dct8x8_allowed(h);
4539
4540         for(list=0; list<h->list_count; list++){
4541             for(i=0; i<4; i++){
4542                 if(IS_DIRECT(h->sub_mb_type[i])) {
4543                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4544                     continue;
4545                 }
4546                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4547                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4548
4549                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4550                     const int sub_mb_type= h->sub_mb_type[i];
4551                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4552                     for(j=0; j<sub_partition_count[i]; j++){
4553                         int mx, my;
4554                         const int index= 4*i + block_width*j;
4555                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4556                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4557                         mx += get_se_golomb(&s->gb);
4558                         my += get_se_golomb(&s->gb);
4559                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4560
4561                         if(IS_SUB_8X8(sub_mb_type)){
4562                             mv_cache[ 1 ][0]=
4563                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4564                             mv_cache[ 1 ][1]=
4565                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4566                         }else if(IS_SUB_8X4(sub_mb_type)){
4567                             mv_cache[ 1 ][0]= mx;
4568                             mv_cache[ 1 ][1]= my;
4569                         }else if(IS_SUB_4X8(sub_mb_type)){
4570                             mv_cache[ 8 ][0]= mx;
4571                             mv_cache[ 8 ][1]= my;
4572                         }
4573                         mv_cache[ 0 ][0]= mx;
4574                         mv_cache[ 0 ][1]= my;
4575                     }
4576                 }else{
4577                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4578                     p[0] = p[1]=
4579                     p[8] = p[9]= 0;
4580                 }
4581             }
4582         }
4583     }else if(IS_DIRECT(mb_type)){
4584         pred_direct_motion(h, &mb_type);
4585         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4586     }else{
4587         int list, mx, my, i;
4588          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4589         if(IS_16X16(mb_type)){
4590             for(list=0; list<h->list_count; list++){
4591                     unsigned int val;
4592                     if(IS_DIR(mb_type, 0, list)){
4593                         val= get_te0_golomb(&s->gb, h->ref_count[list]);
4594                         if(val >= h->ref_count[list]){
4595                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4596                             return -1;
4597                         }
4598                     }else
4599                         val= LIST_NOT_USED&0xFF;
4600                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4601             }
4602             for(list=0; list<h->list_count; list++){
4603                 unsigned int val;
4604                 if(IS_DIR(mb_type, 0, list)){
4605                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4606                     mx += get_se_golomb(&s->gb);
4607                     my += get_se_golomb(&s->gb);
4608                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4609
4610                     val= pack16to32(mx,my);
4611                 }else
4612                     val=0;
4613                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4614             }
4615         }
4616         else if(IS_16X8(mb_type)){
4617             for(list=0; list<h->list_count; list++){
4618                     for(i=0; i<2; i++){
4619                         unsigned int val;
4620                         if(IS_DIR(mb_type, i, list)){
4621                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4622                             if(val >= h->ref_count[list]){
4623                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4624                                 return -1;
4625                             }
4626                         }else
4627                             val= LIST_NOT_USED&0xFF;
4628                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4629                     }
4630             }
4631             for(list=0; list<h->list_count; list++){
4632                 for(i=0; i<2; i++){
4633                     unsigned int val;
4634                     if(IS_DIR(mb_type, i, list)){
4635                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4636                         mx += get_se_golomb(&s->gb);
4637                         my += get_se_golomb(&s->gb);
4638                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4639
4640                         val= pack16to32(mx,my);
4641                     }else
4642                         val=0;
4643                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4644                 }
4645             }
4646         }else{
4647             assert(IS_8X16(mb_type));
4648             for(list=0; list<h->list_count; list++){
4649                     for(i=0; i<2; i++){
4650                         unsigned int val;
4651                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4652                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4653                             if(val >= h->ref_count[list]){
4654                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4655                                 return -1;
4656                             }
4657                         }else
4658                             val= LIST_NOT_USED&0xFF;
4659                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4660                     }
4661             }
4662             for(list=0; list<h->list_count; list++){
4663                 for(i=0; i<2; i++){
4664                     unsigned int val;
4665                     if(IS_DIR(mb_type, i, list)){
4666                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4667                         mx += get_se_golomb(&s->gb);
4668                         my += get_se_golomb(&s->gb);
4669                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4670
4671                         val= pack16to32(mx,my);
4672                     }else
4673                         val=0;
4674                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4675                 }
4676             }
4677         }
4678     }
4679
4680     if(IS_INTER(mb_type))
4681         write_back_motion(h, mb_type);
4682
4683     if(!IS_INTRA16x16(mb_type)){
4684         cbp= get_ue_golomb(&s->gb);
4685         if(cbp > 47){
4686             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4687             return -1;
4688         }
4689
4690         if(CHROMA){
4691             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4692             else                     cbp= golomb_to_inter_cbp   [cbp];
4693         }else{
4694             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4695             else                     cbp= golomb_to_inter_cbp_gray[cbp];
4696         }
4697     }
4698     h->cbp = cbp;
4699
4700     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4701         if(get_bits1(&s->gb)){
4702             mb_type |= MB_TYPE_8x8DCT;
4703             h->cbp_table[mb_xy]= cbp;
4704         }
4705     }
4706     s->current_picture.mb_type[mb_xy]= mb_type;
4707
4708     if(cbp || IS_INTRA16x16(mb_type)){
4709         int i8x8, i4x4, chroma_idx;
4710         int dquant;
4711         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4712         const uint8_t *scan, *scan8x8, *dc_scan;
4713
4714 //        fill_non_zero_count_cache(h);
4715
4716         if(IS_INTERLACED(mb_type)){
4717             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4718             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4719             dc_scan= luma_dc_field_scan;
4720         }else{
4721             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4722             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4723             dc_scan= luma_dc_zigzag_scan;
4724         }
4725
4726         dquant= get_se_golomb(&s->gb);
4727
4728         if( dquant > 25 || dquant < -26 ){
4729             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4730             return -1;
4731         }
4732
4733         s->qscale += dquant;
4734         if(((unsigned)s->qscale) > 51){
4735             if(s->qscale<0) s->qscale+= 52;
4736             else            s->qscale-= 52;
4737         }
4738
4739         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4740         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4741         if(IS_INTRA16x16(mb_type)){
4742             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4743                 return -1; //FIXME continue if partitioned and other return -1 too
4744             }
4745
4746             assert((cbp&15) == 0 || (cbp&15) == 15);
4747
4748             if(cbp&15){
4749                 for(i8x8=0; i8x8<4; i8x8++){
4750                     for(i4x4=0; i4x4<4; i4x4++){
4751                         const int index= i4x4 + 4*i8x8;
4752                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4753                             return -1;
4754                         }
4755                     }
4756                 }
4757             }else{
4758                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4759             }
4760         }else{
4761             for(i8x8=0; i8x8<4; i8x8++){
4762                 if(cbp & (1<<i8x8)){
4763                     if(IS_8x8DCT(mb_type)){
4764                         DCTELEM *buf = &h->mb[64*i8x8];
4765                         uint8_t *nnz;
4766                         for(i4x4=0; i4x4<4; i4x4++){
4767                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4768                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4769                                 return -1;
4770                         }
4771                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4772                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4773                     }else{
4774                         for(i4x4=0; i4x4<4; i4x4++){
4775                             const int index= i4x4 + 4*i8x8;
4776
4777                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4778                                 return -1;
4779                             }
4780                         }
4781                     }
4782                 }else{
4783                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4784                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4785                 }
4786             }
4787         }
4788
4789         if(cbp&0x30){
4790             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4791                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4792                     return -1;
4793                 }
4794         }
4795
4796         if(cbp&0x20){
4797             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4798                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4799                 for(i4x4=0; i4x4<4; i4x4++){
4800                     const int index= 16 + 4*chroma_idx + i4x4;
4801                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4802                         return -1;
4803                     }
4804                 }
4805             }
4806         }else{
4807             uint8_t * const nnz= &h->non_zero_count_cache[0];
4808             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4809             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4810         }
4811     }else{
4812         uint8_t * const nnz= &h->non_zero_count_cache[0];
4813         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4814         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4815         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4816     }
4817     s->current_picture.qscale_table[mb_xy]= s->qscale;
4818     write_back_non_zero_count(h);
4819
4820     if(MB_MBAFF){
4821         h->ref_count[0] >>= 1;
4822         h->ref_count[1] >>= 1;
4823     }
4824
4825     return 0;
4826 }
4827
4828 static int decode_cabac_field_decoding_flag(H264Context *h) {
4829     MpegEncContext * const s = &h->s;
4830     const int mb_x = s->mb_x;
4831     const int mb_y = s->mb_y & ~1;
4832     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4833     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4834
4835     unsigned int ctx = 0;
4836
4837     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4838         ctx += 1;
4839     }
4840     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4841         ctx += 1;
4842     }
4843
4844     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4845 }
4846
4847 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4848     uint8_t *state= &h->cabac_state[ctx_base];
4849     int mb_type;
4850
4851     if(intra_slice){
4852         MpegEncContext * const s = &h->s;
4853         const int mba_xy = h->left_mb_xy[0];
4854         const int mbb_xy = h->top_mb_xy;
4855         int ctx=0;
4856         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4857             ctx++;
4858         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4859             ctx++;
4860         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4861             return 0;   /* I4x4 */
4862         state += 2;
4863     }else{
4864         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4865             return 0;   /* I4x4 */
4866     }
4867
4868     if( get_cabac_terminate( &h->cabac ) )
4869         return 25;  /* PCM */
4870
4871     mb_type = 1; /* I16x16 */
4872     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4873     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4874         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4875     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4876     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4877     return mb_type;
4878 }
4879
4880 static int decode_cabac_mb_type( H264Context *h ) {
4881     MpegEncContext * const s = &h->s;
4882
4883     if( h->slice_type_nos == FF_I_TYPE ) {
4884         return decode_cabac_intra_mb_type(h, 3, 1);
4885     } else if( h->slice_type_nos == FF_P_TYPE ) {
4886         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
4887             /* P-type */
4888             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
4889                 /* P_L0_D16x16, P_8x8 */
4890                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
4891             } else {
4892                 /* P_L0_D8x16, P_L0_D16x8 */
4893                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
4894             }
4895         } else {
4896             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
4897         }
4898     } else if( h->slice_type_nos == FF_B_TYPE ) {
4899         const int mba_xy = h->left_mb_xy[0];
4900         const int mbb_xy = h->top_mb_xy;
4901         int ctx = 0;
4902         int bits;
4903
4904         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4905             ctx++;
4906         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4907             ctx++;
4908
4909         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4910             return 0; /* B_Direct_16x16 */
4911
4912         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4913             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4914         }
4915
4916         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4917         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4918         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4919         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4920         if( bits < 8 )
4921             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4922         else if( bits == 13 ) {
4923             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4924         } else if( bits == 14 )
4925             return 11; /* B_L1_L0_8x16 */
4926         else if( bits == 15 )
4927             return 22; /* B_8x8 */
4928
4929         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4930         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4931     } else {
4932         /* TODO SI/SP frames? */
4933         return -1;
4934     }
4935 }
4936
4937 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
4938     MpegEncContext * const s = &h->s;
4939     int mba_xy, mbb_xy;
4940     int ctx = 0;
4941
4942     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
4943         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
4944         mba_xy = mb_xy - 1;
4945         if( (mb_y&1)
4946             && h->slice_table[mba_xy] == h->slice_num
4947             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
4948             mba_xy += s->mb_stride;
4949         if( MB_FIELD ){
4950             mbb_xy = mb_xy - s->mb_stride;
4951             if( !(mb_y&1)
4952                 && h->slice_table[mbb_xy] == h->slice_num
4953                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
4954                 mbb_xy -= s->mb_stride;
4955         }else
4956             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
4957     }else{
4958         int mb_xy = h->mb_xy;
4959         mba_xy = mb_xy - 1;
4960         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
4961     }
4962
4963     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4964         ctx++;
4965     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
4966         ctx++;
4967
4968     if( h->slice_type_nos == FF_B_TYPE )
4969         ctx += 13;
4970     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
4971 }
4972
4973 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4974     int mode = 0;
4975
4976     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
4977         return pred_mode;
4978
4979     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
4980     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
4981     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
4982
4983     if( mode >= pred_mode )
4984         return mode + 1;
4985     else
4986         return mode;
4987 }
4988
4989 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
4990     const int mba_xy = h->left_mb_xy[0];
4991     const int mbb_xy = h->top_mb_xy;
4992
4993     int ctx = 0;
4994
4995     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
4996     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
4997         ctx++;
4998
4999     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5000         ctx++;
5001
5002     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5003         return 0;
5004
5005     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5006         return 1;
5007     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5008         return 2;
5009     else
5010         return 3;
5011 }
5012
5013 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5014     int cbp_b, cbp_a, ctx, cbp = 0;
5015
5016     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5017     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
5018
5019     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5020     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5021     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
5022     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5023     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
5024     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5025     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
5026     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5027     return cbp;
5028 }
5029 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5030     int ctx;
5031     int cbp_a, cbp_b;
5032
5033     cbp_a = (h->left_cbp>>4)&0x03;
5034     cbp_b = (h-> top_cbp>>4)&0x03;
5035
5036     ctx = 0;
5037     if( cbp_a > 0 ) ctx++;
5038     if( cbp_b > 0 ) ctx += 2;
5039     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5040         return 0;
5041
5042     ctx = 4;
5043     if( cbp_a == 2 ) ctx++;
5044     if( cbp_b == 2 ) ctx += 2;
5045     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5046 }
5047 static int decode_cabac_mb_dqp( H264Context *h) {
5048     int   ctx = 0;
5049     int   val = 0;
5050
5051     if( h->last_qscale_diff != 0 )
5052         ctx++;
5053
5054     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5055         if( ctx < 2 )
5056             ctx = 2;
5057         else
5058             ctx = 3;
5059         val++;
5060         if(val > 102) //prevent infinite loop
5061             return INT_MIN;
5062     }
5063
5064     if( val&0x01 )
5065         return (val + 1)/2;
5066     else
5067         return -(val + 1)/2;
5068 }
5069 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5070     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5071         return 0;   /* 8x8 */
5072     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5073         return 1;   /* 8x4 */
5074     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5075         return 2;   /* 4x8 */
5076     return 3;       /* 4x4 */
5077 }
5078 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5079     int type;
5080     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5081         return 0;   /* B_Direct_8x8 */
5082     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5083         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5084     type = 3;
5085     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5086         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5087             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5088         type += 4;
5089     }
5090     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5091     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5092     return type;
5093 }
5094
5095 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5096     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5097 }
5098
5099 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5100     int refa = h->ref_cache[list][scan8[n] - 1];
5101     int refb = h->ref_cache[list][scan8[n] - 8];
5102     int ref  = 0;
5103     int ctx  = 0;
5104
5105     if( h->slice_type_nos == FF_B_TYPE) {
5106         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5107             ctx++;
5108         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5109             ctx += 2;
5110     } else {
5111         if( refa > 0 )
5112             ctx++;
5113         if( refb > 0 )
5114             ctx += 2;
5115     }
5116
5117     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5118         ref++;
5119         if( ctx < 4 )
5120             ctx = 4;
5121         else
5122             ctx = 5;
5123         if(ref >= 32 /*h->ref_list[list]*/){
5124             av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
5125             return 0; //FIXME we should return -1 and check the return everywhere
5126         }
5127     }
5128     return ref;
5129 }
5130
5131 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5132     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5133                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5134     int ctxbase = (l == 0) ? 40 : 47;
5135     int ctx, mvd;
5136
5137     if( amvd < 3 )
5138         ctx = 0;
5139     else if( amvd > 32 )
5140         ctx = 2;
5141     else
5142         ctx = 1;
5143
5144     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5145         return 0;
5146
5147     mvd= 1;
5148     ctx= 3;
5149     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5150         mvd++;
5151         if( ctx < 6 )
5152             ctx++;
5153     }
5154
5155     if( mvd >= 9 ) {
5156         int k = 3;
5157         while( get_cabac_bypass( &h->cabac ) ) {
5158             mvd += 1 << k;
5159             k++;
5160             if(k>24){
5161                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5162                 return INT_MIN;
5163             }
5164         }
5165         while( k-- ) {
5166             if( get_cabac_bypass( &h->cabac ) )
5167                 mvd += 1 << k;
5168         }
5169     }
5170     return get_cabac_bypass_sign( &h->cabac, -mvd );
5171 }
5172
5173 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5174     int nza, nzb;
5175     int ctx = 0;
5176
5177     if( is_dc ) {
5178         if( cat == 0 ) {
5179             nza = h->left_cbp&0x100;
5180             nzb = h-> top_cbp&0x100;
5181         } else {
5182             nza = (h->left_cbp>>(6+idx))&0x01;
5183             nzb = (h-> top_cbp>>(6+idx))&0x01;
5184         }
5185     } else {
5186         if( cat == 4 ) {
5187             nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5188             nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5189         } else {
5190             assert(cat == 1 || cat == 2);
5191             nza = h->non_zero_count_cache[scan8[idx] - 1];
5192             nzb = h->non_zero_count_cache[scan8[idx] - 8];
5193         }
5194     }
5195
5196     if( nza > 0 )
5197         ctx++;
5198
5199     if( nzb > 0 )
5200         ctx += 2;
5201
5202     return ctx + 4 * cat;
5203 }
5204
5205 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5206     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5207     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5208     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5209     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5210 };
5211
5212 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5213     static const int significant_coeff_flag_offset[2][6] = {
5214       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5215       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5216     };
5217     static const int last_coeff_flag_offset[2][6] = {
5218       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5219       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5220     };
5221     static const int coeff_abs_level_m1_offset[6] = {
5222         227+0, 227+10, 227+20, 227+30, 227+39, 426
5223     };
5224     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5225       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5226         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5227         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5228        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5229       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5230         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5231         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5232         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5233     };
5234     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5235      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5236      * map node ctx => cabac ctx for level=1 */
5237     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5238     /* map node ctx => cabac ctx for level>1 */
5239     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5240     static const uint8_t coeff_abs_level_transition[2][8] = {
5241     /* update node ctx after decoding a level=1 */
5242         { 1, 2, 3, 3, 4, 5, 6, 7 },
5243     /* update node ctx after decoding a level>1 */
5244         { 4, 4, 4, 4, 5, 6, 7, 7 }
5245     };
5246
5247     int index[64];
5248
5249     int av_unused last;
5250     int coeff_count = 0;
5251     int node_ctx = 0;
5252
5253     uint8_t *significant_coeff_ctx_base;
5254     uint8_t *last_coeff_ctx_base;
5255     uint8_t *abs_level_m1_ctx_base;
5256
5257 #ifndef ARCH_X86
5258 #define CABAC_ON_STACK
5259 #endif
5260 #ifdef CABAC_ON_STACK
5261 #define CC &cc
5262     CABACContext cc;
5263     cc.range     = h->cabac.range;
5264     cc.low       = h->cabac.low;
5265     cc.bytestream= h->cabac.bytestream;
5266 #else
5267 #define CC &h->cabac
5268 #endif
5269
5270
5271     /* cat: 0-> DC 16x16  n = 0
5272      *      1-> AC 16x16  n = luma4x4idx
5273      *      2-> Luma4x4   n = luma4x4idx
5274      *      3-> DC Chroma n = iCbCr
5275      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5276      *      5-> Luma8x8   n = 4 * luma8x8idx
5277      */
5278
5279     /* read coded block flag */
5280     if( is_dc || cat != 5 ) {
5281         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5282             if( !is_dc ) {
5283                 if( cat == 4 )
5284                     h->non_zero_count_cache[scan8[16+n]] = 0;
5285                 else
5286                     h->non_zero_count_cache[scan8[n]] = 0;
5287             }
5288
5289 #ifdef CABAC_ON_STACK
5290             h->cabac.range     = cc.range     ;
5291             h->cabac.low       = cc.low       ;
5292             h->cabac.bytestream= cc.bytestream;
5293 #endif
5294             return;
5295         }
5296     }
5297
5298     significant_coeff_ctx_base = h->cabac_state
5299         + significant_coeff_flag_offset[MB_FIELD][cat];
5300     last_coeff_ctx_base = h->cabac_state
5301         + last_coeff_flag_offset[MB_FIELD][cat];
5302     abs_level_m1_ctx_base = h->cabac_state
5303         + coeff_abs_level_m1_offset[cat];
5304
5305     if( !is_dc && cat == 5 ) {
5306 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5307         for(last= 0; last < coefs; last++) { \
5308             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5309             if( get_cabac( CC, sig_ctx )) { \
5310                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5311                 index[coeff_count++] = last; \
5312                 if( get_cabac( CC, last_ctx ) ) { \
5313                     last= max_coeff; \
5314                     break; \
5315                 } \
5316             } \
5317         }\
5318         if( last == max_coeff -1 ) {\
5319             index[coeff_count++] = last;\
5320         }
5321         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5322 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5323         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5324     } else {
5325         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5326 #else
5327         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5328     } else {
5329         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5330 #endif
5331     }
5332     assert(coeff_count > 0);
5333
5334     if( is_dc ) {
5335         if( cat == 0 )
5336             h->cbp_table[h->mb_xy] |= 0x100;
5337         else
5338             h->cbp_table[h->mb_xy] |= 0x40 << n;
5339     } else {
5340         if( cat == 5 )
5341             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5342         else if( cat == 4 )
5343             h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5344         else {
5345             assert( cat == 1 || cat == 2 );
5346             h->non_zero_count_cache[scan8[n]] = coeff_count;
5347         }
5348     }
5349
5350     do {
5351         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5352
5353         int j= scantable[index[--coeff_count]];
5354
5355         if( get_cabac( CC, ctx ) == 0 ) {
5356             node_ctx = coeff_abs_level_transition[0][node_ctx];
5357             if( is_dc ) {
5358                 block[j] = get_cabac_bypass_sign( CC, -1);
5359             }else{
5360                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5361             }
5362         } else {
5363             int coeff_abs = 2;
5364             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5365             node_ctx = coeff_abs_level_transition[1][node_ctx];
5366
5367             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5368                 coeff_abs++;
5369             }
5370
5371             if( coeff_abs >= 15 ) {
5372                 int j = 0;
5373                 while( get_cabac_bypass( CC ) ) {
5374                     j++;
5375                 }
5376
5377                 coeff_abs=1;
5378                 while( j-- ) {
5379                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5380                 }
5381                 coeff_abs+= 14;
5382             }
5383
5384             if( is_dc ) {
5385                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5386             }else{
5387                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5388             }
5389         }
5390     } while( coeff_count );
5391 #ifdef CABAC_ON_STACK
5392             h->cabac.range     = cc.range     ;
5393             h->cabac.low       = cc.low       ;
5394             h->cabac.bytestream= cc.bytestream;
5395 #endif
5396
5397 }
5398
5399 #ifndef CONFIG_SMALL
5400 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5401     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5402 }
5403
5404 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5405     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5406 }
5407 #endif
5408
5409 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5410 #ifdef CONFIG_SMALL
5411     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5412 #else
5413     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5414     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5415 #endif
5416 }
5417
5418 static inline void compute_mb_neighbors(H264Context *h)
5419 {
5420     MpegEncContext * const s = &h->s;
5421     const int mb_xy  = h->mb_xy;
5422     h->top_mb_xy     = mb_xy - s->mb_stride;
5423     h->left_mb_xy[0] = mb_xy - 1;
5424     if(FRAME_MBAFF){
5425         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5426         const int top_pair_xy      = pair_xy     - s->mb_stride;
5427         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5428         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5429         const int curr_mb_frame_flag = !MB_FIELD;
5430         const int bottom = (s->mb_y & 1);
5431         if (bottom
5432                 ? !curr_mb_frame_flag // bottom macroblock
5433                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5434                 ) {
5435             h->top_mb_xy -= s->mb_stride;
5436         }
5437         if (left_mb_frame_flag != curr_mb_frame_flag) {
5438             h->left_mb_xy[0] = pair_xy - 1;
5439         }
5440     } else if (FIELD_PICTURE) {
5441         h->top_mb_xy -= s->mb_stride;
5442     }
5443     return;
5444 }
5445
5446 /**
5447  * decodes a macroblock
5448  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5449  */
5450 static int decode_mb_cabac(H264Context *h) {
5451     MpegEncContext * const s = &h->s;
5452     int mb_xy;
5453     int mb_type, partition_count, cbp = 0;
5454     int dct8x8_allowed= h->pps.transform_8x8_mode;
5455
5456     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5457
5458     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5459
5460     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5461     if( h->slice_type_nos != FF_I_TYPE ) {
5462         int skip;
5463         /* a skipped mb needs the aff flag from the following mb */
5464         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5465             predict_field_decoding_flag(h);
5466         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5467             skip = h->next_mb_skipped;
5468         else
5469             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5470         /* read skip flags */
5471         if( skip ) {
5472             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5473                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5474                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5475                 if(h->next_mb_skipped)
5476                     predict_field_decoding_flag(h);
5477                 else
5478                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5479             }
5480
5481             decode_mb_skip(h);
5482
5483             h->cbp_table[mb_xy] = 0;
5484             h->chroma_pred_mode_table[mb_xy] = 0;
5485             h->last_qscale_diff = 0;
5486
5487             return 0;
5488
5489         }
5490     }
5491     if(FRAME_MBAFF){
5492         if( (s->mb_y&1) == 0 )
5493             h->mb_mbaff =
5494             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5495     }
5496
5497     h->prev_mb_skipped = 0;
5498
5499     compute_mb_neighbors(h);
5500     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5501         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5502         return -1;
5503     }
5504
5505     if( h->slice_type_nos == FF_B_TYPE ) {
5506         if( mb_type < 23 ){
5507             partition_count= b_mb_type_info[mb_type].partition_count;
5508             mb_type=         b_mb_type_info[mb_type].type;
5509         }else{
5510             mb_type -= 23;
5511             goto decode_intra_mb;
5512         }
5513     } else if( h->slice_type_nos == FF_P_TYPE ) {
5514         if( mb_type < 5) {
5515             partition_count= p_mb_type_info[mb_type].partition_count;
5516             mb_type=         p_mb_type_info[mb_type].type;
5517         } else {
5518             mb_type -= 5;
5519             goto decode_intra_mb;
5520         }
5521     } else {
5522         if(h->slice_type == FF_SI_TYPE && mb_type)
5523             mb_type--;
5524         assert(h->slice_type_nos == FF_I_TYPE);
5525 decode_intra_mb:
5526         partition_count = 0;
5527         cbp= i_mb_type_info[mb_type].cbp;
5528         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5529         mb_type= i_mb_type_info[mb_type].type;
5530     }
5531     if(MB_FIELD)
5532         mb_type |= MB_TYPE_INTERLACED;
5533
5534     h->slice_table[ mb_xy ]= h->slice_num;
5535
5536     if(IS_INTRA_PCM(mb_type)) {
5537         const uint8_t *ptr;
5538
5539         // We assume these blocks are very rare so we do not optimize it.
5540         // FIXME The two following lines get the bitstream position in the cabac
5541         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5542         ptr= h->cabac.bytestream;
5543         if(h->cabac.low&0x1) ptr--;
5544         if(CABAC_BITS==16){
5545             if(h->cabac.low&0x1FF) ptr--;
5546         }
5547
5548         // The pixels are stored in the same order as levels in h->mb array.
5549         memcpy(h->mb, ptr, 256); ptr+=256;
5550         if(CHROMA){
5551             memcpy(h->mb+128, ptr, 128); ptr+=128;
5552         }
5553
5554         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5555
5556         // All blocks are present
5557         h->cbp_table[mb_xy] = 0x1ef;
5558         h->chroma_pred_mode_table[mb_xy] = 0;
5559         // In deblocking, the quantizer is 0
5560         s->current_picture.qscale_table[mb_xy]= 0;
5561         // All coeffs are present
5562         memset(h->non_zero_count[mb_xy], 16, 16);
5563         s->current_picture.mb_type[mb_xy]= mb_type;
5564         h->last_qscale_diff = 0;
5565         return 0;
5566     }
5567
5568     if(MB_MBAFF){
5569         h->ref_count[0] <<= 1;
5570         h->ref_count[1] <<= 1;
5571     }
5572
5573     fill_caches(h, mb_type, 0);
5574
5575     if( IS_INTRA( mb_type ) ) {
5576         int i, pred_mode;
5577         if( IS_INTRA4x4( mb_type ) ) {
5578             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5579                 mb_type |= MB_TYPE_8x8DCT;
5580                 for( i = 0; i < 16; i+=4 ) {
5581                     int pred = pred_intra_mode( h, i );
5582                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5583                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5584                 }
5585             } else {
5586                 for( i = 0; i < 16; i++ ) {
5587                     int pred = pred_intra_mode( h, i );
5588                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5589
5590                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5591                 }
5592             }
5593             write_back_intra_pred_mode(h);
5594             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5595         } else {
5596             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5597             if( h->intra16x16_pred_mode < 0 ) return -1;
5598         }
5599         if(CHROMA){
5600             h->chroma_pred_mode_table[mb_xy] =
5601             pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5602
5603             pred_mode= check_intra_pred_mode( h, pred_mode );
5604             if( pred_mode < 0 ) return -1;
5605             h->chroma_pred_mode= pred_mode;
5606         }
5607     } else if( partition_count == 4 ) {
5608         int i, j, sub_partition_count[4], list, ref[2][4];
5609
5610         if( h->slice_type_nos == FF_B_TYPE ) {
5611             for( i = 0; i < 4; i++ ) {
5612                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5613                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5614                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5615             }
5616             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5617                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5618                 pred_direct_motion(h, &mb_type);
5619                 h->ref_cache[0][scan8[4]] =
5620                 h->ref_cache[1][scan8[4]] =
5621                 h->ref_cache[0][scan8[12]] =
5622                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5623                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5624                     for( i = 0; i < 4; i++ )
5625                         if( IS_DIRECT(h->sub_mb_type[i]) )
5626                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5627                 }
5628             }
5629         } else {
5630             for( i = 0; i < 4; i++ ) {
5631                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5632                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5633                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5634             }
5635         }
5636
5637         for( list = 0; list < h->list_count; list++ ) {
5638                 for( i = 0; i < 4; i++ ) {
5639                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5640                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5641                         if( h->ref_count[list] > 1 )
5642                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5643                         else
5644                             ref[list][i] = 0;
5645                     } else {
5646                         ref[list][i] = -1;
5647                     }
5648                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5649                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5650                 }
5651         }
5652
5653         if(dct8x8_allowed)
5654             dct8x8_allowed = get_dct8x8_allowed(h);
5655
5656         for(list=0; list<h->list_count; list++){
5657             for(i=0; i<4; i++){
5658                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5659                 if(IS_DIRECT(h->sub_mb_type[i])){
5660                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5661                     continue;
5662                 }
5663
5664                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5665                     const int sub_mb_type= h->sub_mb_type[i];
5666                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5667                     for(j=0; j<sub_partition_count[i]; j++){
5668                         int mpx, mpy;
5669                         int mx, my;
5670                         const int index= 4*i + block_width*j;
5671                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5672                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5673                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5674
5675                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5676                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5677                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5678
5679                         if(IS_SUB_8X8(sub_mb_type)){
5680                             mv_cache[ 1 ][0]=
5681                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5682                             mv_cache[ 1 ][1]=
5683                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5684
5685                             mvd_cache[ 1 ][0]=
5686                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5687                             mvd_cache[ 1 ][1]=
5688                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5689                         }else if(IS_SUB_8X4(sub_mb_type)){
5690                             mv_cache[ 1 ][0]= mx;
5691                             mv_cache[ 1 ][1]= my;
5692
5693                             mvd_cache[ 1 ][0]= mx - mpx;
5694                             mvd_cache[ 1 ][1]= my - mpy;
5695                         }else if(IS_SUB_4X8(sub_mb_type)){
5696                             mv_cache[ 8 ][0]= mx;
5697                             mv_cache[ 8 ][1]= my;
5698
5699                             mvd_cache[ 8 ][0]= mx - mpx;
5700                             mvd_cache[ 8 ][1]= my - mpy;
5701                         }
5702                         mv_cache[ 0 ][0]= mx;
5703                         mv_cache[ 0 ][1]= my;
5704
5705                         mvd_cache[ 0 ][0]= mx - mpx;
5706                         mvd_cache[ 0 ][1]= my - mpy;
5707                     }
5708                 }else{
5709                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5710                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5711                     p[0] = p[1] = p[8] = p[9] = 0;
5712                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5713                 }
5714             }
5715         }
5716     } else if( IS_DIRECT(mb_type) ) {
5717         pred_direct_motion(h, &mb_type);
5718         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5719         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5720         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5721     } else {
5722         int list, mx, my, i, mpx, mpy;
5723         if(IS_16X16(mb_type)){
5724             for(list=0; list<h->list_count; list++){
5725                 if(IS_DIR(mb_type, 0, list)){
5726                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
5727                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5728                 }else
5729                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5730             }
5731             for(list=0; list<h->list_count; list++){
5732                 if(IS_DIR(mb_type, 0, list)){
5733                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5734
5735                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5736                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5737                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5738
5739                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5740                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5741                 }else
5742                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5743             }
5744         }
5745         else if(IS_16X8(mb_type)){
5746             for(list=0; list<h->list_count; list++){
5747                     for(i=0; i<2; i++){
5748                         if(IS_DIR(mb_type, i, list)){
5749                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
5750                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5751                         }else
5752                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5753                     }
5754             }
5755             for(list=0; list<h->list_count; list++){
5756                 for(i=0; i<2; i++){
5757                     if(IS_DIR(mb_type, i, list)){
5758                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5759                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5760                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5761                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5762
5763                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5764                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5765                     }else{
5766                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5767                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5768                     }
5769                 }
5770             }
5771         }else{
5772             assert(IS_8X16(mb_type));
5773             for(list=0; list<h->list_count; list++){
5774                     for(i=0; i<2; i++){
5775                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5776                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
5777                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5778                         }else
5779                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5780                     }
5781             }
5782             for(list=0; list<h->list_count; list++){
5783                 for(i=0; i<2; i++){
5784                     if(IS_DIR(mb_type, i, list)){
5785                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5786                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5787                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5788
5789                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5790                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5791                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5792                     }else{
5793                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5794                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5795                     }
5796                 }
5797             }
5798         }
5799     }
5800
5801    if( IS_INTER( mb_type ) ) {
5802         h->chroma_pred_mode_table[mb_xy] = 0;
5803         write_back_motion( h, mb_type );
5804    }
5805
5806     if( !IS_INTRA16x16( mb_type ) ) {
5807         cbp  = decode_cabac_mb_cbp_luma( h );
5808         if(CHROMA)
5809             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5810     }
5811
5812     h->cbp_table[mb_xy] = h->cbp = cbp;
5813
5814     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5815         if( decode_cabac_mb_transform_size( h ) )
5816             mb_type |= MB_TYPE_8x8DCT;
5817     }
5818     s->current_picture.mb_type[mb_xy]= mb_type;
5819
5820     if( cbp || IS_INTRA16x16( mb_type ) ) {
5821         const uint8_t *scan, *scan8x8, *dc_scan;
5822         const uint32_t *qmul;
5823         int dqp;
5824
5825         if(IS_INTERLACED(mb_type)){
5826             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5827             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5828             dc_scan= luma_dc_field_scan;
5829         }else{
5830             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5831             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5832             dc_scan= luma_dc_zigzag_scan;
5833         }
5834
5835         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5836         if( dqp == INT_MIN ){
5837             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5838             return -1;
5839         }
5840         s->qscale += dqp;
5841         if(((unsigned)s->qscale) > 51){
5842             if(s->qscale<0) s->qscale+= 52;
5843             else            s->qscale-= 52;
5844         }
5845         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5846         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5847
5848         if( IS_INTRA16x16( mb_type ) ) {
5849             int i;
5850             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5851             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5852
5853             if( cbp&15 ) {
5854                 qmul = h->dequant4_coeff[0][s->qscale];
5855                 for( i = 0; i < 16; i++ ) {
5856                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5857                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5858                 }
5859             } else {
5860                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5861             }
5862         } else {
5863             int i8x8, i4x4;
5864             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5865                 if( cbp & (1<<i8x8) ) {
5866                     if( IS_8x8DCT(mb_type) ) {
5867                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5868                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5869                     } else {
5870                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5871                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5872                             const int index = 4*i8x8 + i4x4;
5873                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5874 //START_TIMER
5875                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5876 //STOP_TIMER("decode_residual")
5877                         }
5878                     }
5879                 } else {
5880                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5881                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5882                 }
5883             }
5884         }
5885
5886         if( cbp&0x30 ){
5887             int c;
5888             for( c = 0; c < 2; c++ ) {
5889                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5890                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5891             }
5892         }
5893
5894         if( cbp&0x20 ) {
5895             int c, i;
5896             for( c = 0; c < 2; c++ ) {
5897                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5898                 for( i = 0; i < 4; i++ ) {
5899                     const int index = 16 + 4 * c + i;
5900                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5901                     decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15);
5902                 }
5903             }
5904         } else {
5905             uint8_t * const nnz= &h->non_zero_count_cache[0];
5906             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5907             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5908         }
5909     } else {
5910         uint8_t * const nnz= &h->non_zero_count_cache[0];
5911         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5912         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5913         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5914         h->last_qscale_diff = 0;
5915     }
5916
5917     s->current_picture.qscale_table[mb_xy]= s->qscale;
5918     write_back_non_zero_count(h);
5919
5920     if(MB_MBAFF){
5921         h->ref_count[0] >>= 1;
5922         h->ref_count[1] >>= 1;
5923     }
5924
5925     return 0;
5926 }
5927
5928
5929 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5930     int i, d;
5931     const int index_a = qp + h->slice_alpha_c0_offset;
5932     const int alpha = (alpha_table+52)[index_a];
5933     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5934
5935     if( bS[0] < 4 ) {
5936         int8_t tc[4];
5937         for(i=0; i<4; i++)
5938             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
5939         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5940     } else {
5941         /* 16px edge length, because bS=4 is triggered by being at
5942          * the edge of an intra MB, so all 4 bS are the same */
5943             for( d = 0; d < 16; d++ ) {
5944                 const int p0 = pix[-1];
5945                 const int p1 = pix[-2];
5946                 const int p2 = pix[-3];
5947
5948                 const int q0 = pix[0];
5949                 const int q1 = pix[1];
5950                 const int q2 = pix[2];
5951
5952                 if( FFABS( p0 - q0 ) < alpha &&
5953                     FFABS( p1 - p0 ) < beta &&
5954                     FFABS( q1 - q0 ) < beta ) {
5955
5956                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5957                         if( FFABS( p2 - p0 ) < beta)
5958                         {
5959                             const int p3 = pix[-4];
5960                             /* p0', p1', p2' */
5961                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5962                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5963                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5964                         } else {
5965                             /* p0' */
5966                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5967                         }
5968                         if( FFABS( q2 - q0 ) < beta)
5969                         {
5970                             const int q3 = pix[3];
5971                             /* q0', q1', q2' */
5972                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5973                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5974                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5975                         } else {
5976                             /* q0' */
5977                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5978                         }
5979                     }else{
5980                         /* p0', q0' */
5981                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5982                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5983                     }
5984                     tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
5985                 }
5986                 pix += stride;
5987             }
5988     }
5989 }
5990 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5991     int i;
5992     const int index_a = qp + h->slice_alpha_c0_offset;
5993     const int alpha = (alpha_table+52)[index_a];
5994     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5995
5996     if( bS[0] < 4 ) {
5997         int8_t tc[4];
5998         for(i=0; i<4; i++)
5999             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6000         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6001     } else {
6002         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6003     }
6004 }
6005
6006 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6007     int i;
6008     for( i = 0; i < 16; i++, pix += stride) {
6009         int index_a;
6010         int alpha;
6011         int beta;
6012
6013         int qp_index;
6014         int bS_index = (i >> 1);
6015         if (!MB_FIELD) {
6016             bS_index &= ~1;
6017             bS_index |= (i & 1);
6018         }
6019
6020         if( bS[bS_index] == 0 ) {
6021             continue;
6022         }
6023
6024         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6025         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6026         alpha = (alpha_table+52)[index_a];
6027         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6028
6029         if( bS[bS_index] < 4 ) {
6030             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
6031             const int p0 = pix[-1];
6032             const int p1 = pix[-2];
6033             const int p2 = pix[-3];
6034             const int q0 = pix[0];
6035             const int q1 = pix[1];
6036             const int q2 = pix[2];
6037
6038             if( FFABS( p0 - q0 ) < alpha &&
6039                 FFABS( p1 - p0 ) < beta &&
6040                 FFABS( q1 - q0 ) < beta ) {
6041                 int tc = tc0;
6042                 int i_delta;
6043
6044                 if( FFABS( p2 - p0 ) < beta ) {
6045                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6046                     tc++;
6047                 }
6048                 if( FFABS( q2 - q0 ) < beta ) {
6049                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6050                     tc++;
6051                 }
6052
6053                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6054                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6055                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6056                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6057             }
6058         }else{
6059             const int p0 = pix[-1];
6060             const int p1 = pix[-2];
6061             const int p2 = pix[-3];
6062
6063             const int q0 = pix[0];
6064             const int q1 = pix[1];
6065             const int q2 = pix[2];
6066
6067             if( FFABS( p0 - q0 ) < alpha &&
6068                 FFABS( p1 - p0 ) < beta &&
6069                 FFABS( q1 - q0 ) < beta ) {
6070
6071                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6072                     if( FFABS( p2 - p0 ) < beta)
6073                     {
6074                         const int p3 = pix[-4];
6075                         /* p0', p1', p2' */
6076                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6077                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6078                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6079                     } else {
6080                         /* p0' */
6081                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6082                     }
6083                     if( FFABS( q2 - q0 ) < beta)
6084                     {
6085                         const int q3 = pix[3];
6086                         /* q0', q1', q2' */
6087                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6088                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6089                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6090                     } else {
6091                         /* q0' */
6092                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6093                     }
6094                 }else{
6095                     /* p0', q0' */
6096                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6097                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6098                 }
6099                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6100             }
6101         }
6102     }
6103 }
6104 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6105     int i;
6106     for( i = 0; i < 8; i++, pix += stride) {
6107         int index_a;
6108         int alpha;
6109         int beta;
6110
6111         int qp_index;
6112         int bS_index = i;
6113
6114         if( bS[bS_index] == 0 ) {
6115             continue;
6116         }
6117
6118         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6119         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6120         alpha = (alpha_table+52)[index_a];
6121         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6122
6123         if( bS[bS_index] < 4 ) {
6124             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6125             const int p0 = pix[-1];
6126             const int p1 = pix[-2];
6127             const int q0 = pix[0];
6128             const int q1 = pix[1];
6129
6130             if( FFABS( p0 - q0 ) < alpha &&
6131                 FFABS( p1 - p0 ) < beta &&
6132                 FFABS( q1 - q0 ) < beta ) {
6133                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6134
6135                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6136                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6137                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6138             }
6139         }else{
6140             const int p0 = pix[-1];
6141             const int p1 = pix[-2];
6142             const int q0 = pix[0];
6143             const int q1 = pix[1];
6144
6145             if( FFABS( p0 - q0 ) < alpha &&
6146                 FFABS( p1 - p0 ) < beta &&
6147                 FFABS( q1 - q0 ) < beta ) {
6148
6149                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6150                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6151                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6152             }
6153         }
6154     }
6155 }
6156
6157 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6158     int i, d;
6159     const int index_a = qp + h->slice_alpha_c0_offset;
6160     const int alpha = (alpha_table+52)[index_a];
6161     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6162     const int pix_next  = stride;
6163
6164     if( bS[0] < 4 ) {
6165         int8_t tc[4];
6166         for(i=0; i<4; i++)
6167             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6168         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6169     } else {
6170         /* 16px edge length, see filter_mb_edgev */
6171             for( d = 0; d < 16; d++ ) {
6172                 const int p0 = pix[-1*pix_next];
6173                 const int p1 = pix[-2*pix_next];
6174                 const int p2 = pix[-3*pix_next];
6175                 const int q0 = pix[0];
6176                 const int q1 = pix[1*pix_next];
6177                 const int q2 = pix[2*pix_next];
6178
6179                 if( FFABS( p0 - q0 ) < alpha &&
6180                     FFABS( p1 - p0 ) < beta &&
6181                     FFABS( q1 - q0 ) < beta ) {
6182
6183                     const int p3 = pix[-4*pix_next];
6184                     const int q3 = pix[ 3*pix_next];
6185
6186                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6187                         if( FFABS( p2 - p0 ) < beta) {
6188                             /* p0', p1', p2' */
6189                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6190                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6191                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6192                         } else {
6193                             /* p0' */
6194                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6195                         }
6196                         if( FFABS( q2 - q0 ) < beta) {
6197                             /* q0', q1', q2' */
6198                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6199                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6200                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6201                         } else {
6202                             /* q0' */
6203                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6204                         }
6205                     }else{
6206                         /* p0', q0' */
6207                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6208                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6209                     }
6210                     tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6211                 }
6212                 pix++;
6213             }
6214     }
6215 }
6216
6217 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6218     int i;
6219     const int index_a = qp + h->slice_alpha_c0_offset;
6220     const int alpha = (alpha_table+52)[index_a];
6221     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6222
6223     if( bS[0] < 4 ) {
6224         int8_t tc[4];
6225         for(i=0; i<4; i++)
6226             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6227         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6228     } else {
6229         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6230     }
6231 }
6232
6233 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6234     MpegEncContext * const s = &h->s;
6235     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6236     int mb_xy, mb_type;
6237     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6238
6239     mb_xy = h->mb_xy;
6240
6241     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6242 1 ||
6243        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6244                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6245         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6246         return;
6247     }
6248     assert(!FRAME_MBAFF);
6249
6250     mb_type = s->current_picture.mb_type[mb_xy];
6251     qp = s->current_picture.qscale_table[mb_xy];
6252     qp0 = s->current_picture.qscale_table[mb_xy-1];
6253     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6254     qpc = get_chroma_qp( h, 0, qp );
6255     qpc0 = get_chroma_qp( h, 0, qp0 );
6256     qpc1 = get_chroma_qp( h, 0, qp1 );
6257     qp0 = (qp + qp0 + 1) >> 1;
6258     qp1 = (qp + qp1 + 1) >> 1;
6259     qpc0 = (qpc + qpc0 + 1) >> 1;
6260     qpc1 = (qpc + qpc1 + 1) >> 1;
6261     qp_thresh = 15 - h->slice_alpha_c0_offset;
6262     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6263        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6264         return;
6265
6266     if( IS_INTRA(mb_type) ) {
6267         int16_t bS4[4] = {4,4,4,4};
6268         int16_t bS3[4] = {3,3,3,3};
6269         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6270         if( IS_8x8DCT(mb_type) ) {
6271             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6272             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6273             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6274             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6275         } else {
6276             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6277             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6278             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6279             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6280             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6281             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6282             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6283             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6284         }
6285         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6286         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6287         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6288         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6289         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6290         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6291         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6292         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6293         return;
6294     } else {
6295         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6296         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6297         int edges;
6298         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6299             edges = 4;
6300             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6301         } else {
6302             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6303                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6304             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6305                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6306                              ? 3 : 0;
6307             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6308             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6309             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6310                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6311         }
6312         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6313             bSv[0][0] = 0x0004000400040004ULL;
6314         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6315             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6316
6317 #define FILTER(hv,dir,edge)\
6318         if(bSv[dir][edge]) {\
6319             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6320             if(!(edge&1)) {\
6321                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6322                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6323             }\
6324         }
6325         if( edges == 1 ) {
6326             FILTER(v,0,0);
6327             FILTER(h,1,0);
6328         } else if( IS_8x8DCT(mb_type) ) {
6329             FILTER(v,0,0);
6330             FILTER(v,0,2);
6331             FILTER(h,1,0);
6332             FILTER(h,1,2);
6333         } else {
6334             FILTER(v,0,0);
6335             FILTER(v,0,1);
6336             FILTER(v,0,2);
6337             FILTER(v,0,3);
6338             FILTER(h,1,0);
6339             FILTER(h,1,1);
6340             FILTER(h,1,2);
6341             FILTER(h,1,3);
6342         }
6343 #undef FILTER
6344     }
6345 }
6346
6347 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6348     MpegEncContext * const s = &h->s;
6349     const int mb_xy= mb_x + mb_y*s->mb_stride;
6350     const int mb_type = s->current_picture.mb_type[mb_xy];
6351     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6352     int first_vertical_edge_done = 0;
6353     int dir;
6354
6355     //for sufficiently low qp, filtering wouldn't do anything
6356     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6357     if(!FRAME_MBAFF){
6358         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6359         int qp = s->current_picture.qscale_table[mb_xy];
6360         if(qp <= qp_thresh
6361            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6362            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6363             return;
6364         }
6365     }
6366
6367     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6368     if(!h->pps.cabac && h->pps.transform_8x8_mode){
6369         int top_type, left_type[2];
6370         top_type     = s->current_picture.mb_type[h->top_mb_xy]    ;
6371         left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6372         left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6373
6374         if(IS_8x8DCT(top_type)){
6375             h->non_zero_count_cache[4+8*0]=
6376             h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6377             h->non_zero_count_cache[6+8*0]=
6378             h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6379         }
6380         if(IS_8x8DCT(left_type[0])){
6381             h->non_zero_count_cache[3+8*1]=
6382             h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6383         }
6384         if(IS_8x8DCT(left_type[1])){
6385             h->non_zero_count_cache[3+8*3]=
6386             h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6387         }
6388
6389         if(IS_8x8DCT(mb_type)){
6390             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
6391             h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp_table[mb_xy] & 1;
6392
6393             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6394             h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp_table[mb_xy] & 2;
6395
6396             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6397             h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp_table[mb_xy] & 4;
6398
6399             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6400             h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp_table[mb_xy] & 8;
6401         }
6402     }
6403
6404     if (FRAME_MBAFF
6405             // left mb is in picture
6406             && h->slice_table[mb_xy-1] != 255
6407             // and current and left pair do not have the same interlaced type
6408             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6409             // and left mb is in the same slice if deblocking_filter == 2
6410             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6411         /* First vertical edge is different in MBAFF frames
6412          * There are 8 different bS to compute and 2 different Qp
6413          */
6414         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6415         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6416         int16_t bS[8];
6417         int qp[2];
6418         int bqp[2];
6419         int rqp[2];
6420         int mb_qp, mbn0_qp, mbn1_qp;
6421         int i;
6422         first_vertical_edge_done = 1;
6423
6424         if( IS_INTRA(mb_type) )
6425             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6426         else {
6427             for( i = 0; i < 8; i++ ) {
6428                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6429
6430                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6431                     bS[i] = 4;
6432                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6433                          /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
6434                          h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
6435                     bS[i] = 2;
6436                 else
6437                     bS[i] = 1;
6438             }
6439         }
6440
6441         mb_qp = s->current_picture.qscale_table[mb_xy];
6442         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6443         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6444         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6445         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6446                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6447         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6448                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6449         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6450         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6451                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6452         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6453                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6454
6455         /* Filter edge */
6456         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6457         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6458         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6459         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6460         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6461     }
6462     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6463     for( dir = 0; dir < 2; dir++ )
6464     {
6465         int edge;
6466         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6467         const int mbm_type = s->current_picture.mb_type[mbm_xy];
6468         int (*ref2frm) [48+2] = h->ref2frm[ h->slice_num          &15 ];
6469         int (*ref2frmm)[48+2] = h->ref2frm[ h->slice_table[mbm_xy]&15 ];
6470         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
6471
6472         const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6473                                   == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6474         // how often to recheck mv-based bS when iterating between edges
6475         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6476                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6477         // how often to recheck mv-based bS when iterating along each edge
6478         const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6479
6480         if (first_vertical_edge_done) {
6481             start = 1;
6482             first_vertical_edge_done = 0;
6483         }
6484
6485         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6486             start = 1;
6487
6488         if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6489             && !IS_INTERLACED(mb_type)
6490             && IS_INTERLACED(mbm_type)
6491             ) {
6492             // This is a special case in the norm where the filtering must
6493             // be done twice (one each of the field) even if we are in a
6494             // frame macroblock.
6495             //
6496             static const int nnz_idx[4] = {4,5,6,3};
6497             unsigned int tmp_linesize   = 2 *   linesize;
6498             unsigned int tmp_uvlinesize = 2 * uvlinesize;
6499             int mbn_xy = mb_xy - 2 * s->mb_stride;
6500             int qp;
6501             int i, j;
6502             int16_t bS[4];
6503
6504             for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6505                 if( IS_INTRA(mb_type) ||
6506                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6507                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6508                 } else {
6509                     const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6510                     for( i = 0; i < 4; i++ ) {
6511                         if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6512                             mbn_nnz[nnz_idx[i]] != 0 )
6513                             bS[i] = 2;
6514                         else
6515                             bS[i] = 1;
6516                     }
6517                 }
6518                 // Do not use s->qscale as luma quantizer because it has not the same
6519                 // value in IPCM macroblocks.
6520                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6521                 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6522                 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6523                 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6524                 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6525                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6526                 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6527                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6528             }
6529
6530             start = 1;
6531         }
6532
6533         /* Calculate bS */
6534         for( edge = start; edge < edges; edge++ ) {
6535             /* mbn_xy: neighbor macroblock */
6536             const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6537             const int mbn_type = s->current_picture.mb_type[mbn_xy];
6538             int (*ref2frmn)[48+2] = edge > 0 ? ref2frm : ref2frmm;
6539             int16_t bS[4];
6540             int qp;
6541
6542             if( (edge&1) && IS_8x8DCT(mb_type) )
6543                 continue;
6544
6545             if( IS_INTRA(mb_type) ||
6546                 IS_INTRA(mbn_type) ) {
6547                 int value;
6548                 if (edge == 0) {
6549                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6550                         || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6551                     ) {
6552                         value = 4;
6553                     } else {
6554                         value = 3;
6555                     }
6556                 } else {
6557                     value = 3;
6558                 }
6559                 bS[0] = bS[1] = bS[2] = bS[3] = value;
6560             } else {
6561                 int i, l;
6562                 int mv_done;
6563
6564                 if( edge & mask_edge ) {
6565                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
6566                     mv_done = 1;
6567                 }
6568                 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6569                     bS[0] = bS[1] = bS[2] = bS[3] = 1;
6570                     mv_done = 1;
6571                 }
6572                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6573                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
6574                     int bn_idx= b_idx - (dir ? 8:1);
6575                     int v = 0;
6576
6577                     for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6578                         v |= ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[l][h->ref_cache[l][bn_idx]+2] ||
6579                              FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6580                              FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6581                     }
6582
6583                     if(h->slice_type_nos == FF_B_TYPE && v){
6584                         v=0;
6585                         for( l = 0; !v && l < 2; l++ ) {
6586                             int ln= 1-l;
6587                             v |= ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[ln][h->ref_cache[ln][bn_idx]+2] ||
6588                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6589                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6590                         }
6591                     }
6592
6593                     bS[0] = bS[1] = bS[2] = bS[3] = v;
6594                     mv_done = 1;
6595                 }
6596                 else
6597                     mv_done = 0;
6598
6599                 for( i = 0; i < 4; i++ ) {
6600                     int x = dir == 0 ? edge : i;
6601                     int y = dir == 0 ? i    : edge;
6602                     int b_idx= 8 + 4 + x + 8*y;
6603                     int bn_idx= b_idx - (dir ? 8:1);
6604
6605                     if( h->non_zero_count_cache[b_idx] != 0 ||
6606                         h->non_zero_count_cache[bn_idx] != 0 ) {
6607                         bS[i] = 2;
6608                     }
6609                     else if(!mv_done)
6610                     {
6611                         bS[i] = 0;
6612                         for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6613                             if( ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[l][h->ref_cache[l][bn_idx]+2] ||
6614                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6615                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6616                                 bS[i] = 1;
6617                                 break;
6618                             }
6619                         }
6620
6621                         if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6622                             bS[i] = 0;
6623                             for( l = 0; l < 2; l++ ) {
6624                                 int ln= 1-l;
6625                                 if( ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[ln][h->ref_cache[ln][bn_idx]+2] ||
6626                                     FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6627                                     FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6628                                     bS[i] = 1;
6629                                     break;
6630                                 }
6631                             }
6632                         }
6633                     }
6634                 }
6635
6636                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6637                     continue;
6638             }
6639
6640             /* Filter edge */
6641             // Do not use s->qscale as luma quantizer because it has not the same
6642             // value in IPCM macroblocks.
6643             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6644             //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6645             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6646             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6647             if( dir == 0 ) {
6648                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6649                 if( (edge&1) == 0 ) {
6650                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6651                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6652                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6653                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6654                 }
6655             } else {
6656                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6657                 if( (edge&1) == 0 ) {
6658                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6659                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6660                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6661                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6662                 }
6663             }
6664         }
6665     }
6666 }
6667
6668 static int decode_slice(struct AVCodecContext *avctx, H264Context *h){
6669     MpegEncContext * const s = &h->s;
6670     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6671
6672     s->mb_skip_run= -1;
6673
6674     if( h->pps.cabac ) {
6675         int i;
6676
6677         /* realign */
6678         align_get_bits( &s->gb );
6679
6680         /* init cabac */
6681         ff_init_cabac_states( &h->cabac);
6682         ff_init_cabac_decoder( &h->cabac,
6683                                s->gb.buffer + get_bits_count(&s->gb)/8,
6684                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6685         /* calculate pre-state */
6686         for( i= 0; i < 460; i++ ) {
6687             int pre;
6688             if( h->slice_type_nos == FF_I_TYPE )
6689                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6690             else
6691                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6692
6693             if( pre <= 63 )
6694                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6695             else
6696                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6697         }
6698
6699         for(;;){
6700 //START_TIMER
6701             int ret = decode_mb_cabac(h);
6702             int eos;
6703 //STOP_TIMER("decode_mb_cabac")
6704
6705             if(ret>=0) hl_decode_mb(h);
6706
6707             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6708                 s->mb_y++;
6709
6710                 if(ret>=0) ret = decode_mb_cabac(h);
6711
6712                 if(ret>=0) hl_decode_mb(h);
6713                 s->mb_y--;
6714             }
6715             eos = get_cabac_terminate( &h->cabac );
6716
6717             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6718                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6719                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6720                 return -1;
6721             }
6722
6723             if( ++s->mb_x >= s->mb_width ) {
6724                 s->mb_x = 0;
6725                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6726                 ++s->mb_y;
6727                 if(FIELD_OR_MBAFF_PICTURE) {
6728                     ++s->mb_y;
6729                 }
6730             }
6731
6732             if( eos || s->mb_y >= s->mb_height ) {
6733                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6734                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6735                 return 0;
6736             }
6737         }
6738
6739     } else {
6740         for(;;){
6741             int ret = decode_mb_cavlc(h);
6742
6743             if(ret>=0) hl_decode_mb(h);
6744
6745             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6746                 s->mb_y++;
6747                 ret = decode_mb_cavlc(h);
6748
6749                 if(ret>=0) hl_decode_mb(h);
6750                 s->mb_y--;
6751             }
6752
6753             if(ret<0){
6754                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6755                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6756
6757                 return -1;
6758             }
6759
6760             if(++s->mb_x >= s->mb_width){
6761                 s->mb_x=0;
6762                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6763                 ++s->mb_y;
6764                 if(FIELD_OR_MBAFF_PICTURE) {
6765                     ++s->mb_y;
6766                 }
6767                 if(s->mb_y >= s->mb_height){
6768                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6769
6770                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6771                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6772
6773                         return 0;
6774                     }else{
6775                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6776
6777                         return -1;
6778                     }
6779                 }
6780             }
6781
6782             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6783                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6784                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6785                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6786
6787                     return 0;
6788                 }else{
6789                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6790
6791                     return -1;
6792                 }
6793             }
6794         }
6795     }
6796
6797 #if 0
6798     for(;s->mb_y < s->mb_height; s->mb_y++){
6799         for(;s->mb_x < s->mb_width; s->mb_x++){
6800             int ret= decode_mb(h);
6801
6802             hl_decode_mb(h);
6803
6804             if(ret<0){
6805                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6806                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6807
6808                 return -1;
6809             }
6810
6811             if(++s->mb_x >= s->mb_width){
6812                 s->mb_x=0;
6813                 if(++s->mb_y >= s->mb_height){
6814                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6815                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6816
6817                         return 0;
6818                     }else{
6819                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6820
6821                         return -1;
6822                     }
6823                 }
6824             }
6825
6826             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6827                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6828                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6829
6830                     return 0;
6831                 }else{
6832                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6833
6834                     return -1;
6835                 }
6836             }
6837         }
6838         s->mb_x=0;
6839         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6840     }
6841 #endif
6842     return -1; //not reached
6843 }
6844
6845 static int decode_unregistered_user_data(H264Context *h, int size){
6846     MpegEncContext * const s = &h->s;
6847     uint8_t user_data[16+256];
6848     int e, build, i;
6849
6850     if(size<16)
6851         return -1;
6852
6853     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6854         user_data[i]= get_bits(&s->gb, 8);
6855     }
6856
6857     user_data[i]= 0;
6858     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6859     if(e==1 && build>=0)
6860         h->x264_build= build;
6861
6862     if(s->avctx->debug & FF_DEBUG_BUGS)
6863         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6864
6865     for(; i<size; i++)
6866         skip_bits(&s->gb, 8);
6867
6868     return 0;
6869 }
6870
6871 static int decode_sei(H264Context *h){
6872     MpegEncContext * const s = &h->s;
6873
6874     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6875         int size, type;
6876
6877         type=0;
6878         do{
6879             type+= show_bits(&s->gb, 8);
6880         }while(get_bits(&s->gb, 8) == 255);
6881
6882         size=0;
6883         do{
6884             size+= show_bits(&s->gb, 8);
6885         }while(get_bits(&s->gb, 8) == 255);
6886
6887         switch(type){
6888         case 5:
6889             if(decode_unregistered_user_data(h, size) < 0)
6890                 return -1;
6891             break;
6892         default:
6893             skip_bits(&s->gb, 8*size);
6894         }
6895
6896         //FIXME check bits here
6897         align_get_bits(&s->gb);
6898     }
6899
6900     return 0;
6901 }
6902
6903 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
6904     MpegEncContext * const s = &h->s;
6905     int cpb_count, i;
6906     cpb_count = get_ue_golomb(&s->gb) + 1;
6907     get_bits(&s->gb, 4); /* bit_rate_scale */
6908     get_bits(&s->gb, 4); /* cpb_size_scale */
6909     for(i=0; i<cpb_count; i++){
6910         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6911         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6912         get_bits1(&s->gb);     /* cbr_flag */
6913     }
6914     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6915     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
6916     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
6917     get_bits(&s->gb, 5); /* time_offset_length */
6918 }
6919
6920 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6921     MpegEncContext * const s = &h->s;
6922     int aspect_ratio_info_present_flag;
6923     unsigned int aspect_ratio_idc;
6924     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
6925
6926     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6927
6928     if( aspect_ratio_info_present_flag ) {
6929         aspect_ratio_idc= get_bits(&s->gb, 8);
6930         if( aspect_ratio_idc == EXTENDED_SAR ) {
6931             sps->sar.num= get_bits(&s->gb, 16);
6932             sps->sar.den= get_bits(&s->gb, 16);
6933         }else if(aspect_ratio_idc < sizeof(pixel_aspect)/sizeof(*pixel_aspect)){
6934             sps->sar=  pixel_aspect[aspect_ratio_idc];
6935         }else{
6936             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6937             return -1;
6938         }
6939     }else{
6940         sps->sar.num=
6941         sps->sar.den= 0;
6942     }
6943 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6944
6945     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6946         get_bits1(&s->gb);      /* overscan_appropriate_flag */
6947     }
6948
6949     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
6950         get_bits(&s->gb, 3);    /* video_format */
6951         get_bits1(&s->gb);      /* video_full_range_flag */
6952         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
6953             get_bits(&s->gb, 8); /* colour_primaries */
6954             get_bits(&s->gb, 8); /* transfer_characteristics */
6955             get_bits(&s->gb, 8); /* matrix_coefficients */
6956         }
6957     }
6958
6959     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
6960         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
6961         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
6962     }
6963
6964     sps->timing_info_present_flag = get_bits1(&s->gb);
6965     if(sps->timing_info_present_flag){
6966         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
6967         sps->time_scale = get_bits_long(&s->gb, 32);
6968         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
6969     }
6970
6971     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
6972     if(nal_hrd_parameters_present_flag)
6973         decode_hrd_parameters(h, sps);
6974     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
6975     if(vcl_hrd_parameters_present_flag)
6976         decode_hrd_parameters(h, sps);
6977     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
6978         get_bits1(&s->gb);     /* low_delay_hrd_flag */
6979     get_bits1(&s->gb);         /* pic_struct_present_flag */
6980
6981     sps->bitstream_restriction_flag = get_bits1(&s->gb);
6982     if(sps->bitstream_restriction_flag){
6983         unsigned int num_reorder_frames;
6984         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
6985         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
6986         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
6987         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
6988         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
6989         num_reorder_frames= get_ue_golomb(&s->gb);
6990         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
6991
6992         if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
6993             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", num_reorder_frames);
6994             return -1;
6995         }
6996
6997         sps->num_reorder_frames= num_reorder_frames;
6998     }
6999
7000     return 0;
7001 }
7002
7003 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7004                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7005     MpegEncContext * const s = &h->s;
7006     int i, last = 8, next = 8;
7007     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
7008     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7009         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7010     else
7011     for(i=0;i<size;i++){
7012         if(next)
7013             next = (last + get_se_golomb(&s->gb)) & 0xff;
7014         if(!i && !next){ /* matrix not written, we use the preset one */
7015             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7016             break;
7017         }
7018         last = factors[scan[i]] = next ? next : last;
7019     }
7020 }
7021
7022 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7023                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7024     MpegEncContext * const s = &h->s;
7025     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7026     const uint8_t *fallback[4] = {
7027         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7028         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7029         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7030         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7031     };
7032     if(get_bits1(&s->gb)){
7033         sps->scaling_matrix_present |= is_sps;
7034         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7035         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7036         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7037         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7038         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7039         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7040         if(is_sps || pps->transform_8x8_mode){
7041             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7042             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7043         }
7044     } else if(fallback_sps) {
7045         memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
7046         memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
7047     }
7048 }
7049
7050 /**
7051  * Returns and optionally allocates SPS / PPS structures in the supplied array 'vec'
7052  */
7053 static void *
7054 alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
7055                     const size_t size, const char *name)
7056 {
7057     if(id>=max) {
7058         av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%d) out of range\n", name, id);
7059         return NULL;
7060     }
7061
7062     if(!vec[id]) {
7063         vec[id] = av_mallocz(size);
7064         if(vec[id] == NULL)
7065             av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
7066     }
7067     return vec[id];
7068 }
7069
7070 static inline int decode_seq_parameter_set(H264Context *h){
7071     MpegEncContext * const s = &h->s;
7072     int profile_idc, level_idc;
7073     unsigned int sps_id, tmp, mb_width, mb_height;
7074     int i;
7075     SPS *sps;
7076
7077     profile_idc= get_bits(&s->gb, 8);
7078     get_bits1(&s->gb);   //constraint_set0_flag
7079     get_bits1(&s->gb);   //constraint_set1_flag
7080     get_bits1(&s->gb);   //constraint_set2_flag
7081     get_bits1(&s->gb);   //constraint_set3_flag
7082     get_bits(&s->gb, 4); // reserved
7083     level_idc= get_bits(&s->gb, 8);
7084     sps_id= get_ue_golomb(&s->gb);
7085
7086     sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
7087     if(sps == NULL)
7088         return -1;
7089
7090     sps->profile_idc= profile_idc;
7091     sps->level_idc= level_idc;
7092
7093     if(sps->profile_idc >= 100){ //high profile
7094         sps->chroma_format_idc= get_ue_golomb(&s->gb);
7095         if(sps->chroma_format_idc == 3)
7096             get_bits1(&s->gb);  //residual_color_transform_flag
7097         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7098         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7099         sps->transform_bypass = get_bits1(&s->gb);
7100         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7101     }else{
7102         sps->scaling_matrix_present = 0;
7103         sps->chroma_format_idc= 1;
7104     }
7105
7106     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7107     sps->poc_type= get_ue_golomb(&s->gb);
7108
7109     if(sps->poc_type == 0){ //FIXME #define
7110         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7111     } else if(sps->poc_type == 1){//FIXME #define
7112         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7113         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7114         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7115         tmp= get_ue_golomb(&s->gb);
7116
7117         if(tmp >= sizeof(sps->offset_for_ref_frame) / sizeof(sps->offset_for_ref_frame[0])){
7118             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
7119             return -1;
7120         }
7121         sps->poc_cycle_length= tmp;
7122
7123         for(i=0; i<sps->poc_cycle_length; i++)
7124             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7125     }else if(sps->poc_type != 2){
7126         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7127         return -1;
7128     }
7129
7130     tmp= get_ue_golomb(&s->gb);
7131     if(tmp > MAX_PICTURE_COUNT-2 || tmp >= 32){
7132         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7133         return -1;
7134     }
7135     sps->ref_frame_count= tmp;
7136     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7137     mb_width= get_ue_golomb(&s->gb) + 1;
7138     mb_height= get_ue_golomb(&s->gb) + 1;
7139     if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
7140        avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
7141         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7142         return -1;
7143     }
7144     sps->mb_width = mb_width;
7145     sps->mb_height= mb_height;
7146
7147     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7148     if(!sps->frame_mbs_only_flag)
7149         sps->mb_aff= get_bits1(&s->gb);
7150     else
7151         sps->mb_aff= 0;
7152
7153     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7154
7155 #ifndef ALLOW_INTERLACE
7156     if(sps->mb_aff)
7157         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7158 #endif
7159     if(!sps->direct_8x8_inference_flag && sps->mb_aff)
7160         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
7161
7162     sps->crop= get_bits1(&s->gb);
7163     if(sps->crop){
7164         sps->crop_left  = get_ue_golomb(&s->gb);
7165         sps->crop_right = get_ue_golomb(&s->gb);
7166         sps->crop_top   = get_ue_golomb(&s->gb);
7167         sps->crop_bottom= get_ue_golomb(&s->gb);
7168         if(sps->crop_left || sps->crop_top){
7169             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7170         }
7171         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !h->sps.frame_mbs_only_flag)){
7172             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7173         }
7174     }else{
7175         sps->crop_left  =
7176         sps->crop_right =
7177         sps->crop_top   =
7178         sps->crop_bottom= 0;
7179     }
7180
7181     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7182     if( sps->vui_parameters_present_flag )
7183         decode_vui_parameters(h, sps);
7184
7185     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7186         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s\n",
7187                sps_id, sps->profile_idc, sps->level_idc,
7188                sps->poc_type,
7189                sps->ref_frame_count,
7190                sps->mb_width, sps->mb_height,
7191                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7192                sps->direct_8x8_inference_flag ? "8B8" : "",
7193                sps->crop_left, sps->crop_right,
7194                sps->crop_top, sps->crop_bottom,
7195                sps->vui_parameters_present_flag ? "VUI" : "",
7196                ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc]
7197                );
7198     }
7199     return 0;
7200 }
7201
7202 static void
7203 build_qp_table(PPS *pps, int t, int index)
7204 {
7205     int i;
7206     for(i = 0; i < 52; i++)
7207         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7208 }
7209
7210 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7211     MpegEncContext * const s = &h->s;
7212     unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
7213     PPS *pps;
7214
7215     pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
7216     if(pps == NULL)
7217         return -1;
7218
7219     tmp= get_ue_golomb(&s->gb);
7220     if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
7221         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7222         return -1;
7223     }
7224     pps->sps_id= tmp;
7225
7226     pps->cabac= get_bits1(&s->gb);
7227     pps->pic_order_present= get_bits1(&s->gb);
7228     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7229     if(pps->slice_group_count > 1 ){
7230         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7231         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7232         switch(pps->mb_slice_group_map_type){
7233         case 0:
7234 #if 0
7235 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7236 |    run_length[ i ]                                |1  |ue(v)   |
7237 #endif
7238             break;
7239         case 2:
7240 #if 0
7241 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7242 |{                                                  |   |        |
7243 |    top_left_mb[ i ]                               |1  |ue(v)   |
7244 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7245 |   }                                               |   |        |
7246 #endif
7247             break;
7248         case 3:
7249         case 4:
7250         case 5:
7251 #if 0
7252 |   slice_group_change_direction_flag               |1  |u(1)    |
7253 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7254 #endif
7255             break;
7256         case 6:
7257 #if 0
7258 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7259 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7260 |)                                                  |   |        |
7261 |    slice_group_id[ i ]                            |1  |u(v)    |
7262 #endif
7263             break;
7264         }
7265     }
7266     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7267     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7268     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7269         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7270         pps->ref_count[0]= pps->ref_count[1]= 1;
7271         return -1;
7272     }
7273
7274     pps->weighted_pred= get_bits1(&s->gb);
7275     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7276     pps->init_qp= get_se_golomb(&s->gb) + 26;
7277     pps->init_qs= get_se_golomb(&s->gb) + 26;
7278     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7279     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7280     pps->constrained_intra_pred= get_bits1(&s->gb);
7281     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7282
7283     pps->transform_8x8_mode= 0;
7284     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7285     memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
7286     memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
7287
7288     if(get_bits_count(&s->gb) < bit_length){
7289         pps->transform_8x8_mode= get_bits1(&s->gb);
7290         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7291         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7292     } else {
7293         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7294     }
7295
7296     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7297     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7298     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7299         h->pps.chroma_qp_diff= 1;
7300
7301     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7302         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7303                pps_id, pps->sps_id,
7304                pps->cabac ? "CABAC" : "CAVLC",
7305                pps->slice_group_count,
7306                pps->ref_count[0], pps->ref_count[1],
7307                pps->weighted_pred ? "weighted" : "",
7308                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7309                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7310                pps->constrained_intra_pred ? "CONSTR" : "",
7311                pps->redundant_pic_cnt_present ? "REDU" : "",
7312                pps->transform_8x8_mode ? "8x8DCT" : ""
7313                );
7314     }
7315
7316     return 0;
7317 }
7318
7319 /**
7320  * Call decode_slice() for each context.
7321  *
7322  * @param h h264 master context
7323  * @param context_count number of contexts to execute
7324  */
7325 static void execute_decode_slices(H264Context *h, int context_count){
7326     MpegEncContext * const s = &h->s;
7327     AVCodecContext * const avctx= s->avctx;
7328     H264Context *hx;
7329     int i;
7330
7331     if(context_count == 1) {
7332         decode_slice(avctx, h);
7333     } else {
7334         for(i = 1; i < context_count; i++) {
7335             hx = h->thread_context[i];
7336             hx->s.error_resilience = avctx->error_resilience;
7337             hx->s.error_count = 0;
7338         }
7339
7340         avctx->execute(avctx, (void *)decode_slice,
7341                        (void **)h->thread_context, NULL, context_count);
7342
7343         /* pull back stuff from slices to master context */
7344         hx = h->thread_context[context_count - 1];
7345         s->mb_x = hx->s.mb_x;
7346         s->mb_y = hx->s.mb_y;
7347         s->dropable = hx->s.dropable;
7348         s->picture_structure = hx->s.picture_structure;
7349         for(i = 1; i < context_count; i++)
7350             h->s.error_count += h->thread_context[i]->s.error_count;
7351     }
7352 }
7353
7354
7355 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7356     MpegEncContext * const s = &h->s;
7357     AVCodecContext * const avctx= s->avctx;
7358     int buf_index=0;
7359     H264Context *hx; ///< thread context
7360     int context_count = 0;
7361
7362     h->max_contexts = avctx->thread_count;
7363 #if 0
7364     int i;
7365     for(i=0; i<50; i++){
7366         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7367     }
7368 #endif
7369     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7370         h->current_slice = 0;
7371         if (!s->first_field)
7372             s->current_picture_ptr= NULL;
7373     }
7374
7375     for(;;){
7376         int consumed;
7377         int dst_length;
7378         int bit_length;
7379         const uint8_t *ptr;
7380         int i, nalsize = 0;
7381         int err;
7382
7383         if(h->is_avc) {
7384             if(buf_index >= buf_size) break;
7385             nalsize = 0;
7386             for(i = 0; i < h->nal_length_size; i++)
7387                 nalsize = (nalsize << 8) | buf[buf_index++];
7388             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7389                 if(nalsize == 1){
7390                     buf_index++;
7391                     continue;
7392                 }else{
7393                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7394                     break;
7395                 }
7396             }
7397         } else {
7398             // start code prefix search
7399             for(; buf_index + 3 < buf_size; buf_index++){
7400                 // This should always succeed in the first iteration.
7401                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7402                     break;
7403             }
7404
7405             if(buf_index+3 >= buf_size) break;
7406
7407             buf_index+=3;
7408         }
7409
7410         hx = h->thread_context[context_count];
7411
7412         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7413         if (ptr==NULL || dst_length < 0){
7414             return -1;
7415         }
7416         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7417             dst_length--;
7418         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7419
7420         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7421             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7422         }
7423
7424         if (h->is_avc && (nalsize != consumed)){
7425             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7426             consumed= nalsize;
7427         }
7428
7429         buf_index += consumed;
7430
7431         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7432            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7433             continue;
7434
7435       again:
7436         err = 0;
7437         switch(hx->nal_unit_type){
7438         case NAL_IDR_SLICE:
7439             if (h->nal_unit_type != NAL_IDR_SLICE) {
7440                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7441                 return -1;
7442             }
7443             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7444         case NAL_SLICE:
7445             init_get_bits(&hx->s.gb, ptr, bit_length);
7446             hx->intra_gb_ptr=
7447             hx->inter_gb_ptr= &hx->s.gb;
7448             hx->s.data_partitioning = 0;
7449
7450             if((err = decode_slice_header(hx, h)))
7451                break;
7452
7453             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7454             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7455                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7456                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7457                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7458                && avctx->skip_frame < AVDISCARD_ALL)
7459                 context_count++;
7460             break;
7461         case NAL_DPA:
7462             init_get_bits(&hx->s.gb, ptr, bit_length);
7463             hx->intra_gb_ptr=
7464             hx->inter_gb_ptr= NULL;
7465             hx->s.data_partitioning = 1;
7466
7467             err = decode_slice_header(hx, h);
7468             break;
7469         case NAL_DPB:
7470             init_get_bits(&hx->intra_gb, ptr, bit_length);
7471             hx->intra_gb_ptr= &hx->intra_gb;
7472             break;
7473         case NAL_DPC:
7474             init_get_bits(&hx->inter_gb, ptr, bit_length);
7475             hx->inter_gb_ptr= &hx->inter_gb;
7476
7477             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7478                && s->context_initialized
7479                && s->hurry_up < 5
7480                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7481                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7482                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7483                && avctx->skip_frame < AVDISCARD_ALL)
7484                 context_count++;
7485             break;
7486         case NAL_SEI:
7487             init_get_bits(&s->gb, ptr, bit_length);
7488             decode_sei(h);
7489             break;
7490         case NAL_SPS:
7491             init_get_bits(&s->gb, ptr, bit_length);
7492             decode_seq_parameter_set(h);
7493
7494             if(s->flags& CODEC_FLAG_LOW_DELAY)
7495                 s->low_delay=1;
7496
7497             if(avctx->has_b_frames < 2)
7498                 avctx->has_b_frames= !s->low_delay;
7499             break;
7500         case NAL_PPS:
7501             init_get_bits(&s->gb, ptr, bit_length);
7502
7503             decode_picture_parameter_set(h, bit_length);
7504
7505             break;
7506         case NAL_AUD:
7507         case NAL_END_SEQUENCE:
7508         case NAL_END_STREAM:
7509         case NAL_FILLER_DATA:
7510         case NAL_SPS_EXT:
7511         case NAL_AUXILIARY_SLICE:
7512             break;
7513         default:
7514             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7515         }
7516
7517         if(context_count == h->max_contexts) {
7518             execute_decode_slices(h, context_count);
7519             context_count = 0;
7520         }
7521
7522         if (err < 0)
7523             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7524         else if(err == 1) {
7525             /* Slice could not be decoded in parallel mode, copy down
7526              * NAL unit stuff to context 0 and restart. Note that
7527              * rbsp_buffer is not transferred, but since we no longer
7528              * run in parallel mode this should not be an issue. */
7529             h->nal_unit_type = hx->nal_unit_type;
7530             h->nal_ref_idc   = hx->nal_ref_idc;
7531             hx = h;
7532             goto again;
7533         }
7534     }
7535     if(context_count)
7536         execute_decode_slices(h, context_count);
7537     return buf_index;
7538 }
7539
7540 /**
7541  * returns the number of bytes consumed for building the current frame
7542  */
7543 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7544         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7545         if(pos+10>buf_size) pos=buf_size; // oops ;)
7546
7547         return pos;
7548 }
7549
7550 static int decode_frame(AVCodecContext *avctx,
7551                              void *data, int *data_size,
7552                              const uint8_t *buf, int buf_size)
7553 {
7554     H264Context *h = avctx->priv_data;
7555     MpegEncContext *s = &h->s;
7556     AVFrame *pict = data;
7557     int buf_index;
7558
7559     s->flags= avctx->flags;
7560     s->flags2= avctx->flags2;
7561
7562    /* end of stream, output what is still in the buffers */
7563     if (buf_size == 0) {
7564         Picture *out;
7565         int i, out_idx;
7566
7567 //FIXME factorize this with the output code below
7568         out = h->delayed_pic[0];
7569         out_idx = 0;
7570         for(i=1; h->delayed_pic[i] && h->delayed_pic[i]->poc; i++)
7571             if(h->delayed_pic[i]->poc < out->poc){
7572                 out = h->delayed_pic[i];
7573                 out_idx = i;
7574             }
7575
7576         for(i=out_idx; h->delayed_pic[i]; i++)
7577             h->delayed_pic[i] = h->delayed_pic[i+1];
7578
7579         if(out){
7580             *data_size = sizeof(AVFrame);
7581             *pict= *(AVFrame*)out;
7582         }
7583
7584         return 0;
7585     }
7586
7587     if(h->is_avc && !h->got_avcC) {
7588         int i, cnt, nalsize;
7589         unsigned char *p = avctx->extradata;
7590         if(avctx->extradata_size < 7) {
7591             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7592             return -1;
7593         }
7594         if(*p != 1) {
7595             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7596             return -1;
7597         }
7598         /* sps and pps in the avcC always have length coded with 2 bytes,
7599            so put a fake nal_length_size = 2 while parsing them */
7600         h->nal_length_size = 2;
7601         // Decode sps from avcC
7602         cnt = *(p+5) & 0x1f; // Number of sps
7603         p += 6;
7604         for (i = 0; i < cnt; i++) {
7605             nalsize = AV_RB16(p) + 2;
7606             if(decode_nal_units(h, p, nalsize) < 0) {
7607                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7608                 return -1;
7609             }
7610             p += nalsize;
7611         }
7612         // Decode pps from avcC
7613         cnt = *(p++); // Number of pps
7614         for (i = 0; i < cnt; i++) {
7615             nalsize = AV_RB16(p) + 2;
7616             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7617                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7618                 return -1;
7619             }
7620             p += nalsize;
7621         }
7622         // Now store right nal length size, that will be use to parse all other nals
7623         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7624         // Do not reparse avcC
7625         h->got_avcC = 1;
7626     }
7627
7628     if(avctx->frame_number==0 && !h->is_avc && s->avctx->extradata_size){
7629         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7630             return -1;
7631     }
7632
7633     buf_index=decode_nal_units(h, buf, buf_size);
7634     if(buf_index < 0)
7635         return -1;
7636
7637     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7638         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7639         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7640         return -1;
7641     }
7642
7643     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7644         Picture *out = s->current_picture_ptr;
7645         Picture *cur = s->current_picture_ptr;
7646         int i, pics, cross_idr, out_of_order, out_idx;
7647
7648         s->mb_y= 0;
7649
7650         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7651         s->current_picture_ptr->pict_type= s->pict_type;
7652
7653         if(!s->dropable) {
7654             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7655             h->prev_poc_msb= h->poc_msb;
7656             h->prev_poc_lsb= h->poc_lsb;
7657         }
7658         h->prev_frame_num_offset= h->frame_num_offset;
7659         h->prev_frame_num= h->frame_num;
7660
7661         /*
7662          * FIXME: Error handling code does not seem to support interlaced
7663          * when slices span multiple rows
7664          * The ff_er_add_slice calls don't work right for bottom
7665          * fields; they cause massive erroneous error concealing
7666          * Error marking covers both fields (top and bottom).
7667          * This causes a mismatched s->error_count
7668          * and a bad error table. Further, the error count goes to
7669          * INT_MAX when called for bottom field, because mb_y is
7670          * past end by one (callers fault) and resync_mb_y != 0
7671          * causes problems for the first MB line, too.
7672          */
7673         if (!FIELD_PICTURE)
7674             ff_er_frame_end(s);
7675
7676         MPV_frame_end(s);
7677
7678         if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7679             /* Wait for second field. */
7680             *data_size = 0;
7681
7682         } else {
7683             cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7684             /* Derive top_field_first from field pocs. */
7685             cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7686
7687         //FIXME do something with unavailable reference frames
7688
7689             /* Sort B-frames into display order */
7690
7691             if(h->sps.bitstream_restriction_flag
7692                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7693                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7694                 s->low_delay = 0;
7695             }
7696
7697             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7698                && !h->sps.bitstream_restriction_flag){
7699                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7700                 s->low_delay= 0;
7701             }
7702
7703             pics = 0;
7704             while(h->delayed_pic[pics]) pics++;
7705
7706             assert(pics <= MAX_DELAYED_PIC_COUNT);
7707
7708             h->delayed_pic[pics++] = cur;
7709             if(cur->reference == 0)
7710                 cur->reference = DELAYED_PIC_REF;
7711
7712             out = h->delayed_pic[0];
7713             out_idx = 0;
7714             for(i=1; h->delayed_pic[i] && h->delayed_pic[i]->poc; i++)
7715                 if(h->delayed_pic[i]->poc < out->poc){
7716                     out = h->delayed_pic[i];
7717                     out_idx = i;
7718                 }
7719             cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i];
7720
7721             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7722
7723             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7724                 { }
7725             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7726                || (s->low_delay &&
7727                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7728                  || cur->pict_type == FF_B_TYPE)))
7729             {
7730                 s->low_delay = 0;
7731                 s->avctx->has_b_frames++;
7732             }
7733
7734             if(out_of_order || pics > s->avctx->has_b_frames){
7735                 out->reference &= ~DELAYED_PIC_REF;
7736                 for(i=out_idx; h->delayed_pic[i]; i++)
7737                     h->delayed_pic[i] = h->delayed_pic[i+1];
7738             }
7739             if(!out_of_order && pics > s->avctx->has_b_frames){
7740                 *data_size = sizeof(AVFrame);
7741
7742                 h->outputed_poc = out->poc;
7743                 *pict= *(AVFrame*)out;
7744             }else{
7745                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7746             }
7747         }
7748     }
7749
7750     assert(pict->data[0] || !*data_size);
7751     ff_print_debug_info(s, pict);
7752 //printf("out %d\n", (int)pict->data[0]);
7753 #if 0 //?
7754
7755     /* Return the Picture timestamp as the frame number */
7756     /* we subtract 1 because it is added on utils.c     */
7757     avctx->frame_number = s->picture_number - 1;
7758 #endif
7759     return get_consumed_bytes(s, buf_index, buf_size);
7760 }
7761 #if 0
7762 static inline void fill_mb_avail(H264Context *h){
7763     MpegEncContext * const s = &h->s;
7764     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
7765
7766     if(s->mb_y){
7767         h->mb_avail[0]= s->mb_x                 && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
7768         h->mb_avail[1]=                            h->slice_table[mb_xy - s->mb_stride    ] == h->slice_num;
7769         h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
7770     }else{
7771         h->mb_avail[0]=
7772         h->mb_avail[1]=
7773         h->mb_avail[2]= 0;
7774     }
7775     h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
7776     h->mb_avail[4]= 1; //FIXME move out
7777     h->mb_avail[5]= 0; //FIXME move out
7778 }
7779 #endif
7780
7781 #ifdef TEST
7782 #undef printf
7783 #undef random
7784 #define COUNT 8000
7785 #define SIZE (COUNT*40)
7786 int main(void){
7787     int i;
7788     uint8_t temp[SIZE];
7789     PutBitContext pb;
7790     GetBitContext gb;
7791 //    int int_temp[10000];
7792     DSPContext dsp;
7793     AVCodecContext avctx;
7794
7795     dsputil_init(&dsp, &avctx);
7796
7797     init_put_bits(&pb, temp, SIZE);
7798     printf("testing unsigned exp golomb\n");
7799     for(i=0; i<COUNT; i++){
7800         START_TIMER
7801         set_ue_golomb(&pb, i);
7802         STOP_TIMER("set_ue_golomb");
7803     }
7804     flush_put_bits(&pb);
7805
7806     init_get_bits(&gb, temp, 8*SIZE);
7807     for(i=0; i<COUNT; i++){
7808         int j, s;
7809
7810         s= show_bits(&gb, 24);
7811
7812         START_TIMER
7813         j= get_ue_golomb(&gb);
7814         if(j != i){
7815             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7816 //            return -1;
7817         }
7818         STOP_TIMER("get_ue_golomb");
7819     }
7820
7821
7822     init_put_bits(&pb, temp, SIZE);
7823     printf("testing signed exp golomb\n");
7824     for(i=0; i<COUNT; i++){
7825         START_TIMER
7826         set_se_golomb(&pb, i - COUNT/2);
7827         STOP_TIMER("set_se_golomb");
7828     }
7829     flush_put_bits(&pb);
7830
7831     init_get_bits(&gb, temp, 8*SIZE);
7832     for(i=0; i<COUNT; i++){
7833         int j, s;
7834
7835         s= show_bits(&gb, 24);
7836
7837         START_TIMER
7838         j= get_se_golomb(&gb);
7839         if(j != i - COUNT/2){
7840             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7841 //            return -1;
7842         }
7843         STOP_TIMER("get_se_golomb");
7844     }
7845
7846 #if 0
7847     printf("testing 4x4 (I)DCT\n");
7848
7849     DCTELEM block[16];
7850     uint8_t src[16], ref[16];
7851     uint64_t error= 0, max_error=0;
7852
7853     for(i=0; i<COUNT; i++){
7854         int j;
7855 //        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
7856         for(j=0; j<16; j++){
7857             ref[j]= random()%255;
7858             src[j]= random()%255;
7859         }
7860
7861         h264_diff_dct_c(block, src, ref, 4);
7862
7863         //normalize
7864         for(j=0; j<16; j++){
7865 //            printf("%d ", block[j]);
7866             block[j]= block[j]*4;
7867             if(j&1) block[j]= (block[j]*4 + 2)/5;
7868             if(j&4) block[j]= (block[j]*4 + 2)/5;
7869         }
7870 //        printf("\n");
7871
7872         s->dsp.h264_idct_add(ref, block, 4);
7873 /*        for(j=0; j<16; j++){
7874             printf("%d ", ref[j]);
7875         }
7876         printf("\n");*/
7877
7878         for(j=0; j<16; j++){
7879             int diff= FFABS(src[j] - ref[j]);
7880
7881             error+= diff*diff;
7882             max_error= FFMAX(max_error, diff);
7883         }
7884     }
7885     printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
7886     printf("testing quantizer\n");
7887     for(qp=0; qp<52; qp++){
7888         for(i=0; i<16; i++)
7889             src1_block[i]= src2_block[i]= random()%255;
7890
7891     }
7892     printf("Testing NAL layer\n");
7893
7894     uint8_t bitstream[COUNT];
7895     uint8_t nal[COUNT*2];
7896     H264Context h;
7897     memset(&h, 0, sizeof(H264Context));
7898
7899     for(i=0; i<COUNT; i++){
7900         int zeros= i;
7901         int nal_length;
7902         int consumed;
7903         int out_length;
7904         uint8_t *out;
7905         int j;
7906
7907         for(j=0; j<COUNT; j++){
7908             bitstream[j]= (random() % 255) + 1;
7909         }
7910
7911         for(j=0; j<zeros; j++){
7912             int pos= random() % COUNT;
7913             while(bitstream[pos] == 0){
7914                 pos++;
7915                 pos %= COUNT;
7916             }
7917             bitstream[pos]=0;
7918         }
7919
7920         START_TIMER
7921
7922         nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
7923         if(nal_length<0){
7924             printf("encoding failed\n");
7925             return -1;
7926         }
7927
7928         out= decode_nal(&h, nal, &out_length, &consumed, nal_length);
7929
7930         STOP_TIMER("NAL")
7931
7932         if(out_length != COUNT){
7933             printf("incorrect length %d %d\n", out_length, COUNT);
7934             return -1;
7935         }
7936
7937         if(consumed != nal_length){
7938             printf("incorrect consumed length %d %d\n", nal_length, consumed);
7939             return -1;
7940         }
7941
7942         if(memcmp(bitstream, out, COUNT)){
7943             printf("mismatch\n");
7944             return -1;
7945         }
7946     }
7947 #endif
7948
7949     printf("Testing RBSP\n");
7950
7951
7952     return 0;
7953 }
7954 #endif /* TEST */
7955
7956
7957 /**
7958  * Decoder teardown: frees both RBSP buffers, runs free_tables() and MPV_common_end(); returns 0.
7959  */
7960 static av_cold int decode_end(AVCodecContext *avctx)
7961 {
7962     H264Context *h = avctx->priv_data;
7963     int i;
7964
7965     for(i=0; i<2; i++)
7966         av_freep(&h->rbsp_buffer[i]);
7967     free_tables(h); //FIXME cleanup init stuff perhaps
7968     MPV_common_end(&h->s); // h->s is the MpegEncContext embedded in the H264Context
7969     return 0;
7970 }
7971
7972
7973 AVCodec h264_decoder = { // H.264 decoder descriptor; leading values are positional, so their meaning follows AVCodec's member order (declared outside this file)
7974     "h264", // codec name string
7975     CODEC_TYPE_VIDEO, // media type constant: video
7976     CODEC_ID_H264, // codec identifier constant
7977     sizeof(H264Context), // presumably the size allocated for avctx->priv_data (decode_end() reads an H264Context from it) -- confirm
7978     decode_init, // presumably the init callback slot -- confirm against AVCodec
7979     NULL, // empty slot; presumably the encode callback, since only decode hooks are supplied here -- confirm
7980     decode_end, // teardown function defined just above; presumably the close callback slot -- confirm
7981     decode_frame, // presumably the decode callback slot -- confirm
7982     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY, // capability flags; the DRAW_HORIZ_BAND flag is deliberately left commented out
7983     .flush= flush_dpb, // designated member: flush hook
7984     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"), // descriptive name; presumably compiled out in CONFIG_SMALL builds -- confirm
7985 };
7986
7987 #include "svq3.c"