git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35 #include "rectangle.h"
  36
  37 #include "cabac.h"
  38 #ifdef ARCH_X86
  39 #include "i386/h264_i386.h"
  40 #endif
  41
  42 //#undef NDEBUG
  43 #include <assert.h>
  44
/**
 * Value of Picture.reference when the Picture is not used as a reference
 * picture but is still being held back for delayed output.
 */
#define DELAYED_PIC_REF 4
  50
  51 static VLC coeff_token_vlc[4];
  52 static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
  53 static const int coeff_token_vlc_tables_size[4]={520,332,280,256};
  54
  55 static VLC chroma_dc_coeff_token_vlc;
  56 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
  57 static const int chroma_dc_coeff_token_vlc_table_size = 256;
  58
  59 static VLC total_zeros_vlc[15];
  60 static VLC_TYPE total_zeros_vlc_tables[15][512][2];
  61 static const int total_zeros_vlc_tables_size = 512;
  62
  63 static VLC chroma_dc_total_zeros_vlc[3];
  64 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
  65 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
  66
  67 static VLC run_vlc[6];
  68 static VLC_TYPE run_vlc_tables[6][8][2];
  69 static const int run_vlc_tables_size = 8;
  70
  71 static VLC run7_vlc;
  72 static VLC_TYPE run7_vlc_table[96][2];
  73 static const int run7_vlc_table_size = 96;
  74
  75 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  76 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  77 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  78 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  79 static Picture * remove_long(H264Context *h, int i, int ref_mask);
  80
  81 static av_always_inline uint32_t pack16to32(int a, int b){
  82 #ifdef WORDS_BIGENDIAN
  83    return (b&0xFFFF) + (a<<16);
  84 #else
  85    return (a&0xFFFF) + (b<<16);
  86 #endif
  87 }
  88
  89 const uint8_t ff_rem6[52]={
  90 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
  91 };
  92
  93 const uint8_t ff_div6[52]={
  94 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
  95 };
  96
  97 static const int left_block_options[4][8]={
  98     {0,1,2,3,7,10,8,11},
  99     {2,2,3,3,8,11,8,11},
 100     {0,0,1,1,7,10,7,10},
 101     {0,2,0,2,7,10,7,10}
 102 };
 103
 104 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 105     MpegEncContext * const s = &h->s;
 106     const int mb_xy= h->mb_xy;
 107     int topleft_xy, top_xy, topright_xy, left_xy[2];
 108     int topleft_type, top_type, topright_type, left_type[2];
 109     int * left_block;
 110     int topleft_partition= -1;
 111     int i;
 112
 113     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
 114
 115     //FIXME deblocking could skip the intra and nnz parts.
 116     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
 117         return;
 118
 119     /* Wow, what a mess, why didn't they simplify the interlacing & intra
 120      * stuff, I can't imagine that these complex rules are worth it. */
 121
 122     topleft_xy = top_xy - 1;
 123     topright_xy= top_xy + 1;
 124     left_xy[1] = left_xy[0] = mb_xy-1;
 125     left_block = left_block_options[0];
 126     if(FRAME_MBAFF){
 127         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 128         const int top_pair_xy      = pair_xy     - s->mb_stride;
 129         const int topleft_pair_xy  = top_pair_xy - 1;
 130         const int topright_pair_xy = top_pair_xy + 1;
 131         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 132         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 133         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 134         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 135         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 136         const int bottom = (s->mb_y & 1);
 137         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 138         if (bottom
 139                 ? !curr_mb_frame_flag // bottom macroblock
 140                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 141                 ) {
 142             top_xy -= s->mb_stride;
 143         }
 144         if (bottom
 145                 ? !curr_mb_frame_flag // bottom macroblock
 146                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 147                 ) {
 148             topleft_xy -= s->mb_stride;
 149         } else if(bottom && curr_mb_frame_flag && !left_mb_frame_flag) {
 150             topleft_xy += s->mb_stride;
 151             // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
 152             topleft_partition = 0;
 153         }
 154         if (bottom
 155                 ? !curr_mb_frame_flag // bottom macroblock
 156                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 157                 ) {
 158             topright_xy -= s->mb_stride;
 159         }
 160         if (left_mb_frame_flag != curr_mb_frame_flag) {
 161             left_xy[1] = left_xy[0] = pair_xy - 1;
 162             if (curr_mb_frame_flag) {
 163                 if (bottom) {
 164                     left_block = left_block_options[1];
 165                 } else {
 166                     left_block= left_block_options[2];
 167                 }
 168             } else {
 169                 left_xy[1] += s->mb_stride;
 170                 left_block = left_block_options[3];
 171             }
 172         }
 173     }
 174
 175     h->top_mb_xy = top_xy;
 176     h->left_mb_xy[0] = left_xy[0];
 177     h->left_mb_xy[1] = left_xy[1];
 178     if(for_deblock){
 179         topleft_type = 0;
 180         topright_type = 0;
 181         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
 182         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
 183         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
 184
 185         if(MB_MBAFF && !IS_INTRA(mb_type)){
 186             int list;
 187             for(list=0; list<h->list_count; list++){
 188                 //These values where changed for ease of performing MC, we need to change them back
 189                 //FIXME maybe we can make MC and loop filter use the same values or prevent
 190                 //the MC code from changing ref_cache and rather use a temporary array.
 191                 if(USES_LIST(mb_type,list)){
 192                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 193                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 194                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 195                     ref += h->b8_stride;
 196                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 197                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 198                 }
 199             }
 200         }
 201     }else{
 202         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 203         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 204         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 205         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 206         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 207
 208     if(IS_INTRA(mb_type)){
 209         int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
 210         h->topleft_samples_available=
 211         h->top_samples_available=
 212         h->left_samples_available= 0xFFFF;
 213         h->topright_samples_available= 0xEEEA;
 214
 215         if(!(top_type & type_mask)){
 216             h->topleft_samples_available= 0xB3FF;
 217             h->top_samples_available= 0x33FF;
 218             h->topright_samples_available= 0x26EA;
 219         }
 220         if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
 221             if(IS_INTERLACED(mb_type)){
 222                 if(!(left_type[0] & type_mask)){
 223                     h->topleft_samples_available&= 0xDFFF;
 224                     h->left_samples_available&= 0x5FFF;
 225                 }
 226                 if(!(left_type[1] & type_mask)){
 227                     h->topleft_samples_available&= 0xFF5F;
 228                     h->left_samples_available&= 0xFF5F;
 229                 }
 230             }else{
 231                 int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
 232                                 ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
 233                 assert(left_xy[0] == left_xy[1]);
 234                 if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
 235                     h->topleft_samples_available&= 0xDF5F;
 236                     h->left_samples_available&= 0x5F5F;
 237                 }
 238             }
 239         }else{
 240             if(!(left_type[0] & type_mask)){
 241                 h->topleft_samples_available&= 0xDF5F;
 242                 h->left_samples_available&= 0x5F5F;
 243             }
 244         }
 245
 246         if(!(topleft_type & type_mask))
 247             h->topleft_samples_available&= 0x7FFF;
 248
 249         if(!(topright_type & type_mask))
 250             h->topright_samples_available&= 0xFBFF;
 251
 252         if(IS_INTRA4x4(mb_type)){
 253             if(IS_INTRA4x4(top_type)){
 254                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 255                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 256                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 257                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 258             }else{
 259                 int pred;
 260                 if(!(top_type & type_mask))
 261                     pred= -1;
 262                 else{
 263                     pred= 2;
 264                 }
 265                 h->intra4x4_pred_mode_cache[4+8*0]=
 266                 h->intra4x4_pred_mode_cache[5+8*0]=
 267                 h->intra4x4_pred_mode_cache[6+8*0]=
 268                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 269             }
 270             for(i=0; i<2; i++){
 271                 if(IS_INTRA4x4(left_type[i])){
 272                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 273                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 274                 }else{
 275                     int pred;
 276                     if(!(left_type[i] & type_mask))
 277                         pred= -1;
 278                     else{
 279                         pred= 2;
 280                     }
 281                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 282                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 283                 }
 284             }
 285         }
 286     }
 287     }
 288
 289
 290 /*
 291 0 . T T. T T T T
 292 1 L . .L . . . .
 293 2 L . .L . . . .
 294 3 . T TL . . . .
 295 4 L . .L . . . .
 296 5 L . .. . . . .
 297 */
 298 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
 299     if(top_type){
 300         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 301         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 302         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 303         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 304
 305         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 306         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 307
 308         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 309         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 310
 311     }else{
 312         h->non_zero_count_cache[4+8*0]=
 313         h->non_zero_count_cache[5+8*0]=
 314         h->non_zero_count_cache[6+8*0]=
 315         h->non_zero_count_cache[7+8*0]=
 316
 317         h->non_zero_count_cache[1+8*0]=
 318         h->non_zero_count_cache[2+8*0]=
 319
 320         h->non_zero_count_cache[1+8*3]=
 321         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 322
 323     }
 324
 325     for (i=0; i<2; i++) {
 326         if(left_type[i]){
 327             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 328             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 329             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 330             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 331         }else{
 332             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 333             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 334             h->non_zero_count_cache[0+8*1 +   8*i]=
 335             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 336         }
 337     }
 338
 339     if( h->pps.cabac ) {
 340         // top_cbp
 341         if(top_type) {
 342             h->top_cbp = h->cbp_table[top_xy];
 343         } else if(IS_INTRA(mb_type)) {
 344             h->top_cbp = 0x1C0;
 345         } else {
 346             h->top_cbp = 0;
 347         }
 348         // left_cbp
 349         if (left_type[0]) {
 350             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 351         } else if(IS_INTRA(mb_type)) {
 352             h->left_cbp = 0x1C0;
 353         } else {
 354             h->left_cbp = 0;
 355         }
 356         if (left_type[0]) {
 357             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 358         }
 359         if (left_type[1]) {
 360             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 361         }
 362     }
 363
 364 #if 1
 365     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 366         int list;
 367         for(list=0; list<h->list_count; list++){
 368             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 369                 /*if(!h->mv_cache_clean[list]){
 370                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 371                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 372                     h->mv_cache_clean[list]= 1;
 373                 }*/
 374                 continue;
 375             }
 376             h->mv_cache_clean[list]= 0;
 377
 378             if(USES_LIST(top_type, list)){
 379                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 380                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 381                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 382                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 383                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 384                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 385                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 386                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 387                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 388                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 389             }else{
 390                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 391                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 392                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 393                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 394                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 395             }
 396
 397             for(i=0; i<2; i++){
 398                 int cache_idx = scan8[0] - 1 + i*2*8;
 399                 if(USES_LIST(left_type[i], list)){
 400                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 401                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 402                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 403                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 404                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 405                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 406                 }else{
 407                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 408                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 409                     h->ref_cache[list][cache_idx  ]=
 410                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 411                 }
 412             }
 413
 414             if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
 415                 continue;
 416
 417             if(USES_LIST(topleft_type, list)){
 418                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
 419                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
 420                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 421                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 422             }else{
 423                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 424                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 425             }
 426
 427             if(USES_LIST(topright_type, list)){
 428                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 429                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 430                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 431                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 432             }else{
 433                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 434                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 435             }
 436
 437             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 438                 continue;
 439
 440             h->ref_cache[list][scan8[5 ]+1] =
 441             h->ref_cache[list][scan8[7 ]+1] =
 442             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 443             h->ref_cache[list][scan8[4 ]] =
 444             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 445             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 446             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 447             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 448             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 449             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 450
 451             if( h->pps.cabac ) {
 452                 /* XXX beurk, Load mvd */
 453                 if(USES_LIST(top_type, list)){
 454                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 455                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 456                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 457                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 458                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 459                 }else{
 460                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 461                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 462                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 463                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 464                 }
 465                 if(USES_LIST(left_type[0], list)){
 466                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 467                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 468                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 469                 }else{
 470                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 471                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 472                 }
 473                 if(USES_LIST(left_type[1], list)){
 474                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 475                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 476                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 477                 }else{
 478                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 479                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 480                 }
 481                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 482                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 483                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 484                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 485                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 486
 487                 if(h->slice_type_nos == FF_B_TYPE){
 488                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 489
 490                     if(IS_DIRECT(top_type)){
 491                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 492                     }else if(IS_8X8(top_type)){
 493                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 494                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 495                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 496                     }else{
 497                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 498                     }
 499
 500                     if(IS_DIRECT(left_type[0]))
 501                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 502                     else if(IS_8X8(left_type[0]))
 503                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 504                     else
 505                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 506
 507                     if(IS_DIRECT(left_type[1]))
 508                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 509                     else if(IS_8X8(left_type[1]))
 510                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 511                     else
 512                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 513                 }
 514             }
 515
 516             if(FRAME_MBAFF){
 517 #define MAP_MVS\
 518                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 519                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 520                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 521                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 522                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 523                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 524                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 525                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 526                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 527                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 528                 if(MB_FIELD){
 529 #define MAP_F2F(idx, mb_type)\
 530                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 531                         h->ref_cache[list][idx] <<= 1;\
 532                         h->mv_cache[list][idx][1] /= 2;\
 533                         h->mvd_cache[list][idx][1] /= 2;\
 534                     }
 535                     MAP_MVS
 536 #undef MAP_F2F
 537                 }else{
 538 #define MAP_F2F(idx, mb_type)\
 539                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 540                         h->ref_cache[list][idx] >>= 1;\
 541                         h->mv_cache[list][idx][1] <<= 1;\
 542                         h->mvd_cache[list][idx][1] <<= 1;\
 543                     }
 544                     MAP_MVS
 545 #undef MAP_F2F
 546                 }
 547             }
 548         }
 549     }
 550 #endif
 551
 552     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 553 }
 554
 555 static inline void write_back_intra_pred_mode(H264Context *h){
 556     const int mb_xy= h->mb_xy;
 557
 558     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 559     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 560     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 561     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 562     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 563     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 564     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 565 }
 566
 567 /**
 568  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 569  */
 570 static inline int check_intra4x4_pred_mode(H264Context *h){
 571     MpegEncContext * const s = &h->s;
 572     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 573     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 574     int i;
 575
 576     if(!(h->top_samples_available&0x8000)){
 577         for(i=0; i<4; i++){
 578             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 579             if(status<0){
 580                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 581                 return -1;
 582             } else if(status){
 583                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 584             }
 585         }
 586     }
 587
 588     if((h->left_samples_available&0x8888)!=0x8888){
 589         static const int mask[4]={0x8000,0x2000,0x80,0x20};
 590         for(i=0; i<4; i++){
 591             if(!(h->left_samples_available&mask[i])){
 592             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 593             if(status<0){
 594                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 595                 return -1;
 596             } else if(status){
 597                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 598             }
 599             }
 600         }
 601     }
 602
 603     return 0;
 604 } //FIXME cleanup like next
 605
 606 /**
 607  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 608  */
 609 static inline int check_intra_pred_mode(H264Context *h, int mode){
 610     MpegEncContext * const s = &h->s;
 611     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 612     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 613
 614     if(mode > 6U) {
 615         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 616         return -1;
 617     }
 618
 619     if(!(h->top_samples_available&0x8000)){
 620         mode= top[ mode ];
 621         if(mode<0){
 622             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 623             return -1;
 624         }
 625     }
 626
 627     if((h->left_samples_available&0x8080) != 0x8080){
 628         mode= left[ mode ];
 629         if(h->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
 630             mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(h->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
 631         }
 632         if(mode<0){
 633             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 634             return -1;
 635         }
 636     }
 637
 638     return mode;
 639 }
 640
 641 /**
 642  * gets the predicted intra4x4 prediction mode.
 643  */
 644 static inline int pred_intra_mode(H264Context *h, int n){
 645     const int index8= scan8[n];
 646     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 647     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 648     const int min= FFMIN(left, top);
 649
 650     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 651
 652     if(min<0) return DC_PRED;
 653     else      return min;
 654 }
 655
 656 static inline void write_back_non_zero_count(H264Context *h){
 657     const int mb_xy= h->mb_xy;
 658
 659     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 660     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 661     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 662     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 663     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 664     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 665     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 666
 667     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 668     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 669     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 670
 671     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 672     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 673     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 674 }
 675
 676 /**
 677  * gets the predicted number of non-zero coefficients.
 678  * @param n block index
 679  */
 680 static inline int pred_non_zero_count(H264Context *h, int n){
 681     const int index8= scan8[n];
 682     const int left= h->non_zero_count_cache[index8 - 1];
 683     const int top = h->non_zero_count_cache[index8 - 8];
 684     int i= left + top;
 685
 686     if(i<64) i= (i+1)>>1;
 687
 688     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 689
 690     return i&31;
 691 }
 692
/**
 * Fetches the diagonal (top-right, falling back to top-left) neighbour used
 * for motion vector prediction.
 * @param C          receives a pointer to the selected neighbouring motion vector
 * @param i          position of the block inside the scan8-indexed caches
 * @param list       reference list to look at
 * @param part_width width of the partition in 4x4 block units
 * @return the reference index that belongs to *C; in the MBAFF special cases
 *         handled through SET_DIAG_MV this may be LIST_NOT_USED
 */
static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
    const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
    MpegEncContext *s = &h->s;

    /* there is no consistent mapping of mvs to neighboring locations that will
     * make mbaff happy, so we can't move all this logic to fill_caches */
    if(FRAME_MBAFF){
        const uint32_t *mb_types = s->current_picture_ptr->mb_type;
        const int16_t *mv;
        /* cache slot scan8[0]-2 serves as scratch storage for a vector that is
         * read straight from the picture; *C points at it until overridden below */
        *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
        *C = h->mv_cache[list][scan8[0]-2];

        if(!MB_FIELD
           && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
            int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
            if(IS_INTERLACED(mb_types[topright_xy])){
/* SET_DIAG_MV reads the motion vector and reference index at 4x4 block
 * position (X4,Y4) of the current picture, applies MV_OP to the vertical
 * component and REF_OP to the reference index, stores the vector in the
 * scratch slot and RETURNS from fetch_diagonal_mv(). It also returns
 * LIST_NOT_USED if the macroblock at that position does not use the list. */
#define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
                const int x4 = X4, y4 = Y4;\
                const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
                if(!USES_LIST(mb_type,list))\
                    return LIST_NOT_USED;\
                mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
                h->mv_cache[list][scan8[0]-2][0] = mv[0];\
                h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
                return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;

                SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
            }
        }
        /* top-right is missing: for blocks in the leftmost cache column the
         * replacement may have to come from the left macroblock, converted
         * between frame and field representation */
        if(topright_ref == PART_NOT_AVAILABLE
           && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
           && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
            if(!MB_FIELD
               && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
                SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
            }
            if(MB_FIELD
               && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
               && i >= scan8[0]+8){
                // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
                SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
            }
        }
#undef SET_DIAG_MV
    }

    /* regular case: use the top-right neighbour from the cache if present,
     * otherwise fall back to the top-left one */
    if(topright_ref != PART_NOT_AVAILABLE){
        *C= h->mv_cache[list][ i - 8 + part_width ];
        return topright_ref;
    }else{
        tprintf(s->avctx, "topright MV not available\n");

        *C= h->mv_cache[list][ i - 8 - 1 ];
        return h->ref_cache[list][ i - 8 - 1 ];
    }
}
 749
 750 /**
 751  * gets the predicted MV.
 752  * @param n the block index
 753  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 754  * @param mx the x component of the predicted motion vector
 755  * @param my the y component of the predicted motion vector
 756  */
 757 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 758     const int index8= scan8[n];
 759     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 760     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 761     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 762     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 763     const int16_t * C;
 764     int diagonal_ref, match_count;
 765
 766     assert(part_width==1 || part_width==2 || part_width==4);
 767
 768 /* mv_cache
 769   B . . A T T T T
 770   U . . L . . , .
 771   U . . L . . . .
 772   U . . L . . , .
 773   . . . L . . . .
 774 */
 775
 776     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 777     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 778     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 779     if(match_count > 1){ //most common
 780         *mx= mid_pred(A[0], B[0], C[0]);
 781         *my= mid_pred(A[1], B[1], C[1]);
 782     }else if(match_count==1){
 783         if(left_ref==ref){
 784             *mx= A[0];
 785             *my= A[1];
 786         }else if(top_ref==ref){
 787             *mx= B[0];
 788             *my= B[1];
 789         }else{
 790             *mx= C[0];
 791             *my= C[1];
 792         }
 793     }else{
 794         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 795             *mx= A[0];
 796             *my= A[1];
 797         }else{
 798             *mx= mid_pred(A[0], B[0], C[0]);
 799             *my= mid_pred(A[1], B[1], C[1]);
 800         }
 801     }
 802
 803     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 804 }
 805
 806 /**
 807  * gets the directionally predicted 16x8 MV.
 808  * @param n the block index
 809  * @param mx the x component of the predicted motion vector
 810  * @param my the y component of the predicted motion vector
 811  */
 812 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 813     if(n==0){
 814         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 815         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 816
 817         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 818
 819         if(top_ref == ref){
 820             *mx= B[0];
 821             *my= B[1];
 822             return;
 823         }
 824     }else{
 825         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 826         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 827
 828         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 829
 830         if(left_ref == ref){
 831             *mx= A[0];
 832             *my= A[1];
 833             return;
 834         }
 835     }
 836
 837     //RARE
 838     pred_motion(h, n, 4, list, ref, mx, my);
 839 }
 840
 841 /**
 842  * gets the directionally predicted 8x16 MV.
 843  * @param n the block index
 844  * @param mx the x component of the predicted motion vector
 845  * @param my the y component of the predicted motion vector
 846  */
 847 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 848     if(n==0){
 849         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 850         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 851
 852         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 853
 854         if(left_ref == ref){
 855             *mx= A[0];
 856             *my= A[1];
 857             return;
 858         }
 859     }else{
 860         const int16_t * C;
 861         int diagonal_ref;
 862
 863         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 864
 865         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 866
 867         if(diagonal_ref == ref){
 868             *mx= C[0];
 869             *my= C[1];
 870             return;
 871         }
 872     }
 873
 874     //RARE
 875     pred_motion(h, n, 2, list, ref, mx, my);
 876 }
 877
 878 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 879     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 880     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 881
 882     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 883
 884     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 885        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 886        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 887
 888         *mx = *my = 0;
 889         return;
 890     }
 891
 892     pred_motion(h, 0, 4, 0, 0, mx, my);
 893
 894     return;
 895 }
 896
 897 static inline void direct_dist_scale_factor(H264Context * const h){
 898     MpegEncContext * const s = &h->s;
 899     const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
 900     const int poc1 = h->ref_list[1][0].poc;
 901     int i;
 902     for(i=0; i<h->ref_count[0]; i++){
 903         int poc0 = h->ref_list[0][i].poc;
 904         int td = av_clip(poc1 - poc0, -128, 127);
 905         if(td == 0 || h->ref_list[0][i].long_ref){
 906             h->dist_scale_factor[i] = 256;
 907         }else{
 908             int tb = av_clip(poc - poc0, -128, 127);
 909             int tx = (16384 + (FFABS(td) >> 1)) / td;
 910             h->dist_scale_factor[i] = av_clip((tb*tx + 32) >> 6, -1024, 1023);
 911         }
 912     }
 913     if(FRAME_MBAFF){
 914         for(i=0; i<h->ref_count[0]; i++){
 915             h->dist_scale_factor_field[2*i] =
 916             h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
 917         }
 918     }
 919 }
 920 static inline void direct_ref_list_init(H264Context * const h){
 921     MpegEncContext * const s = &h->s;
 922     Picture * const ref1 = &h->ref_list[1][0];
 923     Picture * const cur = s->current_picture_ptr;
 924     int list, i, j;
 925     int sidx= s->picture_structure&1;
 926     int ref1sidx= ref1->reference&1;
 927     for(list=0; list<2; list++){
 928         cur->ref_count[sidx][list] = h->ref_count[list];
 929         for(j=0; j<h->ref_count[list]; j++)
 930             cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
 931     }
 932     if(s->picture_structure == PICT_FRAME){
 933         memcpy(cur->ref_count[0], cur->ref_count[1], sizeof(cur->ref_count[0]));
 934         memcpy(cur->ref_poc  [0], cur->ref_poc  [1], sizeof(cur->ref_poc  [0]));
 935     }
 936     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 937         return;
 938     for(list=0; list<2; list++){
 939         for(i=0; i<ref1->ref_count[ref1sidx][list]; i++){
 940             int poc = ref1->ref_poc[ref1sidx][list][i];
 941             if(((poc&3) == 3) != (s->picture_structure == PICT_FRAME))
 942                 poc= (poc&~3) + s->picture_structure;
 943             h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
 944             for(j=0; j<h->ref_count[list]; j++)
 945                 if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
 946                     h->map_col_to_list0[list][i] = j;
 947                     break;
 948                 }
 949         }
 950     }
 951     if(FRAME_MBAFF){
 952         for(list=0; list<2; list++){
 953             for(i=0; i<ref1->ref_count[ref1sidx][list]; i++){
 954                 j = h->map_col_to_list0[list][i];
 955                 h->map_col_to_list0_field[list][2*i] = 2*j;
 956                 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
 957             }
 958         }
 959     }
 960 }
 961
 962 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 963     MpegEncContext * const s = &h->s;
 964     int b8_stride = h->b8_stride;
 965     int b4_stride = h->b_stride;
 966     int mb_xy = h->mb_xy;
 967     int mb_type_col[2];
 968     const int16_t (*l1mv0)[2], (*l1mv1)[2];
 969     const int8_t *l1ref0, *l1ref1;
 970     const int is_b8x8 = IS_8X8(*mb_type);
 971     unsigned int sub_mb_type;
 972     int i8, i4;
 973
 974 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
 975
 976     if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
 977         if(h->ref_list[1][0].reference == PICT_FRAME){   // AFL/AFR/FR/FL -> AFL
 978             if(!IS_INTERLACED(*mb_type)){                //     AFR/FR    -> AFL
 979                 int cur_poc = s->current_picture_ptr->poc;
 980                 int *col_poc = h->ref_list[1]->field_poc;
 981                 int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
 982                 mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
 983                 b8_stride = 0;
 984             }
 985         }else if(!(s->picture_structure & h->ref_list[1][0].reference)){// FL -> FL & differ parity
 986             int fieldoff= 2*(h->ref_list[1][0].reference)-3;
 987             mb_xy += s->mb_stride*fieldoff;
 988         }
 989         goto single_col;
 990     }else{                                               // AFL/AFR/FR/FL -> AFR/FR
 991         if(IS_INTERLACED(*mb_type)){                     // AFL       /FL -> AFR/FR
 992             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
 993             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
 994             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
 995             b8_stride *= 3;
 996             b4_stride *= 6;
 997             //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
 998             if(    (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
 999                 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
1000                 && !is_b8x8){
1001                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1002                 *mb_type   |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
1003             }else{
1004                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1005                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1006             }
1007         }else{                                           //     AFR/FR    -> AFR/FR
1008 single_col:
1009             mb_type_col[0] =
1010             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
1011             if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
1012                 /* FIXME save sub mb types from previous frames (or derive from MVs)
1013                 * so we know exactly what block size to use */
1014                 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1015                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1016             }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
1017                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1018                 *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1019             }else{
1020                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1021                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1022             }
1023         }
1024     }
1025
1026     l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
1027     l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
1028     l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
1029     l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
1030     if(!b8_stride){
1031         if(s->mb_y&1){
1032             l1ref0 += h->b8_stride;
1033             l1ref1 += h->b8_stride;
1034             l1mv0  +=  2*b4_stride;
1035             l1mv1  +=  2*b4_stride;
1036         }
1037     }
1038
1039     if(h->direct_spatial_mv_pred){
1040         int ref[2];
1041         int mv[2][2];
1042         int list;
1043
1044         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1045
1046         /* ref = min(neighbors) */
1047         for(list=0; list<2; list++){
1048             int refa = h->ref_cache[list][scan8[0] - 1];
1049             int refb = h->ref_cache[list][scan8[0] - 8];
1050             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1051             if(refc == PART_NOT_AVAILABLE)
1052                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1053             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1054             if(ref[list] < 0)
1055                 ref[list] = -1;
1056         }
1057
1058         if(ref[0] < 0 && ref[1] < 0){
1059             ref[0] = ref[1] = 0;
1060             mv[0][0] = mv[0][1] =
1061             mv[1][0] = mv[1][1] = 0;
1062         }else{
1063             for(list=0; list<2; list++){
1064                 if(ref[list] >= 0)
1065                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1066                 else
1067                     mv[list][0] = mv[list][1] = 0;
1068             }
1069         }
1070
1071         if(ref[1] < 0){
1072             if(!is_b8x8)
1073                 *mb_type &= ~MB_TYPE_L1;
1074             sub_mb_type &= ~MB_TYPE_L1;
1075         }else if(ref[0] < 0){
1076             if(!is_b8x8)
1077                 *mb_type &= ~MB_TYPE_L0;
1078             sub_mb_type &= ~MB_TYPE_L0;
1079         }
1080
1081         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1082             for(i8=0; i8<4; i8++){
1083                 int x8 = i8&1;
1084                 int y8 = i8>>1;
1085                 int xy8 = x8+y8*b8_stride;
1086                 int xy4 = 3*x8+y8*b4_stride;
1087                 int a=0, b=0;
1088
1089                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1090                     continue;
1091                 h->sub_mb_type[i8] = sub_mb_type;
1092
1093                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1094                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1095                 if(!IS_INTRA(mb_type_col[y8])
1096                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1097                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1098                     if(ref[0] > 0)
1099                         a= pack16to32(mv[0][0],mv[0][1]);
1100                     if(ref[1] > 0)
1101                         b= pack16to32(mv[1][0],mv[1][1]);
1102                 }else{
1103                     a= pack16to32(mv[0][0],mv[0][1]);
1104                     b= pack16to32(mv[1][0],mv[1][1]);
1105                 }
1106                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1107                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1108             }
1109         }else if(IS_16X16(*mb_type)){
1110             int a=0, b=0;
1111
1112             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1113             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1114             if(!IS_INTRA(mb_type_col[0])
1115                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1116                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1117                        && (h->x264_build>33 || !h->x264_build)))){
1118                 if(ref[0] > 0)
1119                     a= pack16to32(mv[0][0],mv[0][1]);
1120                 if(ref[1] > 0)
1121                     b= pack16to32(mv[1][0],mv[1][1]);
1122             }else{
1123                 a= pack16to32(mv[0][0],mv[0][1]);
1124                 b= pack16to32(mv[1][0],mv[1][1]);
1125             }
1126             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1127             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1128         }else{
1129             for(i8=0; i8<4; i8++){
1130                 const int x8 = i8&1;
1131                 const int y8 = i8>>1;
1132
1133                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1134                     continue;
1135                 h->sub_mb_type[i8] = sub_mb_type;
1136
1137                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1138                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1139                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1140                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1141
1142                 /* col_zero_flag */
1143                 if(!IS_INTRA(mb_type_col[0]) && (   l1ref0[x8 + y8*b8_stride] == 0
1144                                               || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
1145                                                   && (h->x264_build>33 || !h->x264_build)))){
1146                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
1147                     if(IS_SUB_8X8(sub_mb_type)){
1148                         const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1149                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1150                             if(ref[0] == 0)
1151                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1152                             if(ref[1] == 0)
1153                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1154                         }
1155                     }else
1156                     for(i4=0; i4<4; i4++){
1157                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1158                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1159                             if(ref[0] == 0)
1160                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1161                             if(ref[1] == 0)
1162                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1163                         }
1164                     }
1165                 }
1166             }
1167         }
1168     }else{ /* direct temporal mv pred */
1169         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1170         const int *dist_scale_factor = h->dist_scale_factor;
1171
1172         if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
1173             map_col_to_list0[0] = h->map_col_to_list0_field[0];
1174             map_col_to_list0[1] = h->map_col_to_list0_field[1];
1175             dist_scale_factor = h->dist_scale_factor_field;
1176         }
1177         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1178             /* FIXME assumes direct_8x8_inference == 1 */
1179             int y_shift  = 2*!IS_INTERLACED(*mb_type);
1180             int ref_shift= FRAME_MBAFF ? y_shift : 1;
1181
1182             for(i8=0; i8<4; i8++){
1183                 const int x8 = i8&1;
1184                 const int y8 = i8>>1;
1185                 int ref0, scale;
1186                 const int16_t (*l1mv)[2]= l1mv0;
1187
1188                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1189                     continue;
1190                 h->sub_mb_type[i8] = sub_mb_type;
1191
1192                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1193                 if(IS_INTRA(mb_type_col[y8])){
1194                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1195                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1196                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1197                     continue;
1198                 }
1199
1200                 ref0 = l1ref0[x8 + y8*b8_stride];
1201                 if(ref0 >= 0)
1202                     ref0 = map_col_to_list0[0][ref0*2>>ref_shift];
1203                 else{
1204                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride]*2>>ref_shift];
1205                     l1mv= l1mv1;
1206                 }
1207                 scale = dist_scale_factor[ref0];
1208                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1209
1210                 {
1211                     const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
1212                     int my_col = (mv_col[1]<<y_shift)/2;
1213                     int mx = (scale * mv_col[0] + 128) >> 8;
1214                     int my = (scale * my_col + 128) >> 8;
1215                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1216                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1217                 }
1218             }
1219             return;
1220         }
1221
1222         /* one-to-one mv scaling */
1223
1224         if(IS_16X16(*mb_type)){
1225             int ref, mv0, mv1;
1226
1227             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1228             if(IS_INTRA(mb_type_col[0])){
1229                 ref=mv0=mv1=0;
1230             }else{
1231                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
1232                                                 : map_col_to_list0[1][l1ref1[0]];
1233                 const int scale = dist_scale_factor[ref0];
1234                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1235                 int mv_l0[2];
1236                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1237                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1238                 ref= ref0;
1239                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1240                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1241             }
1242             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1243             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1244             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1245         }else{
1246             for(i8=0; i8<4; i8++){
1247                 const int x8 = i8&1;
1248                 const int y8 = i8>>1;
1249                 int ref0, scale;
1250                 const int16_t (*l1mv)[2]= l1mv0;
1251
1252                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1253                     continue;
1254                 h->sub_mb_type[i8] = sub_mb_type;
1255                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1256                 if(IS_INTRA(mb_type_col[0])){
1257                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1258                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1259                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1260                     continue;
1261                 }
1262
1263                 ref0 = l1ref0[x8 + y8*b8_stride];
1264                 if(ref0 >= 0)
1265                     ref0 = map_col_to_list0[0][ref0];
1266                 else{
1267                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride]];
1268                     l1mv= l1mv1;
1269                 }
1270                 scale = dist_scale_factor[ref0];
1271
1272                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1273                 if(IS_SUB_8X8(sub_mb_type)){
1274                     const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1275                     int mx = (scale * mv_col[0] + 128) >> 8;
1276                     int my = (scale * mv_col[1] + 128) >> 8;
1277                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1278                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1279                 }else
1280                 for(i4=0; i4<4; i4++){
1281                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1282                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1283                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1284                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1285                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1286                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1287                 }
1288             }
1289         }
1290     }
1291 }
1292
1293 static inline void write_back_motion(H264Context *h, int mb_type){
1294     MpegEncContext * const s = &h->s;
1295     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1296     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1297     int list;
1298
1299     if(!USES_LIST(mb_type, 0))
1300         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1301
1302     for(list=0; list<h->list_count; list++){
1303         int y;
1304         if(!USES_LIST(mb_type, list))
1305             continue;
1306
1307         for(y=0; y<4; y++){
1308             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1309             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1310         }
1311         if( h->pps.cabac ) {
1312             if(IS_SKIP(mb_type))
1313                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1314             else
1315             for(y=0; y<4; y++){
1316                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1317                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1318             }
1319         }
1320
1321         {
1322             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1323             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1324             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1325             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1326             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1327         }
1328     }
1329
1330     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1331         if(IS_8X8(mb_type)){
1332             uint8_t *direct_table = &h->direct_table[b8_xy];
1333             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1334             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1335             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1336         }
1337     }
1338 }
1339
1340 /**
1341  * Decodes a network abstraction layer unit.
1342  * @param consumed is the number of bytes used as input
1343  * @param length is the length of the array
1344  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1345  * @returns decoded bytes, might be src+1 if no escapes
1346  */
1347 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1348     int i, si, di;
1349     uint8_t *dst;
1350     int bufidx;
1351
1352 //    src[0]&0x80;                //forbidden bit
1353     h->nal_ref_idc= src[0]>>5;
1354     h->nal_unit_type= src[0]&0x1F;
1355
1356     src++; length--;
1357 #if 0
1358     for(i=0; i<length; i++)
1359         printf("%2X ", src[i]);
1360 #endif
1361     for(i=0; i+1<length; i+=2){
1362         if(src[i]) continue;
1363         if(i>0 && src[i-1]==0) i--;
1364         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1365             if(src[i+2]!=3){
1366                 /* startcode, so we must be past the end */
1367                 length=i;
1368             }
1369             break;
1370         }
1371     }
1372
1373     if(i>=length-1){ //no escaped 0
1374         *dst_length= length;
1375         *consumed= length+1; //+1 for the header
1376         return src;
1377     }
1378
1379     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1380     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
1381     dst= h->rbsp_buffer[bufidx];
1382
1383     if (dst == NULL){
1384         return NULL;
1385     }
1386
1387 //printf("decoding esc\n");
1388     si=di=0;
1389     while(si<length){
1390         //remove escapes (very rare 1:2^22)
1391         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1392             if(src[si+2]==3){ //escape
1393                 dst[di++]= 0;
1394                 dst[di++]= 0;
1395                 si+=3;
1396                 continue;
1397             }else //next start code
1398                 break;
1399         }
1400
1401         dst[di++]= src[si++];
1402     }
1403
1404     *dst_length= di;
1405     *consumed= si + 1;//+1 for the header
1406 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1407     return dst;
1408 }
1409
1410 /**
1411  * identifies the exact end of the bitstream
1412  * @return the length of the trailing, or 0 if damaged
1413  */
1414 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1415     int v= *src;
1416     int r;
1417
1418     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1419
1420     for(r=1; r<9; r++){
1421         if(v&1) return r;
1422         v>>=1;
1423     }
1424     return 0;
1425 }
1426
1427 /**
1428  * IDCT transforms the 16 dc values and dequantizes them.
1429  * @param qp quantization parameter
1430  */
1431 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1432 #define stride 16
1433     int i;
1434     int temp[16]; //FIXME check if this is a good idea
1435     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1436     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1437
1438 //memset(block, 64, 2*256);
1439 //return;
1440     for(i=0; i<4; i++){
1441         const int offset= y_offset[i];
1442         const int z0= block[offset+stride*0] + block[offset+stride*4];
1443         const int z1= block[offset+stride*0] - block[offset+stride*4];
1444         const int z2= block[offset+stride*1] - block[offset+stride*5];
1445         const int z3= block[offset+stride*1] + block[offset+stride*5];
1446
1447         temp[4*i+0]= z0+z3;
1448         temp[4*i+1]= z1+z2;
1449         temp[4*i+2]= z1-z2;
1450         temp[4*i+3]= z0-z3;
1451     }
1452
1453     for(i=0; i<4; i++){
1454         const int offset= x_offset[i];
1455         const int z0= temp[4*0+i] + temp[4*2+i];
1456         const int z1= temp[4*0+i] - temp[4*2+i];
1457         const int z2= temp[4*1+i] - temp[4*3+i];
1458         const int z3= temp[4*1+i] + temp[4*3+i];
1459
1460         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1461         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1462         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1463         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1464     }
1465 }
1466
#if 0
/* Disabled code: everything up to the matching #endif is never compiled. */
/**
 * DCT transforms the 16 dc values.
 * The values are read from and written back to the positions selected by
 * x_offset[] and y_offset[] inside block; every result is shifted right by
 * one bit. No quantization is applied, the qp parameter is commented out.
 * Uses the stride macro that is defined inside h264_luma_dc_dequant_idct_c().
 * @param block coefficient array, modified in place
 * @param qp quantization parameter ??? FIXME (currently not part of the signature)
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    int i;
    int temp[16]; //FIXME check if this is a good idea
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};

    /* first pass: butterflies over the four values reachable from each y_offset */
    for(i=0; i<4; i++){
        const int offset= y_offset[i];
        const int z0= block[offset+stride*0] + block[offset+stride*4];
        const int z1= block[offset+stride*0] - block[offset+stride*4];
        const int z2= block[offset+stride*1] - block[offset+stride*5];
        const int z3= block[offset+stride*1] + block[offset+stride*5];

        temp[4*i+0]= z0+z3;
        temp[4*i+1]= z1+z2;
        temp[4*i+2]= z1-z2;
        temp[4*i+3]= z0-z3;
    }

    /* second pass over the other dimension, results are halved */
    for(i=0; i<4; i++){
        const int offset= x_offset[i];
        const int z0= temp[4*0+i] + temp[4*2+i];
        const int z1= temp[4*0+i] - temp[4*2+i];
        const int z2= temp[4*1+i] - temp[4*3+i];
        const int z3= temp[4*1+i] + temp[4*3+i];

        block[stride*0 +offset]= (z0 + z3)>>1;
        block[stride*2 +offset]= (z1 + z2)>>1;
        block[stride*8 +offset]= (z1 - z2)>>1;
        block[stride*10+offset]= (z0 - z3)>>1;
    }
}
#endif
1506
1507 #undef xStride
1508 #undef stride
1509
1510 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1511     const int stride= 16*2;
1512     const int xStride= 16;
1513     int a,b,c,d,e;
1514
1515     a= block[stride*0 + xStride*0];
1516     b= block[stride*0 + xStride*1];
1517     c= block[stride*1 + xStride*0];
1518     d= block[stride*1 + xStride*1];
1519
1520     e= a-b;
1521     a= a+b;
1522     b= c-d;
1523     c= c+d;
1524
1525     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1526     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1527     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1528     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1529 }
1530
#if 0
/* Disabled code: everything up to the matching #endif is never compiled. */
/**
 * Transforms the 2x2 chroma dc values located at block[0], block[16],
 * block[32] and block[48] in place. No scaling or quantization is applied.
 * @param block coefficient array, modified in place
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int stride= 16*2;
    const int xStride= 16;
    int a,b,c,d,e;

    a= block[stride*0 + xStride*0];
    b= block[stride*0 + xStride*1];
    c= block[stride*1 + xStride*0];
    d= block[stride*1 + xStride*1];

    /* sums and differences of each row; a, b and c are reused for the results */
    e= a-b;
    a= a+b;
    b= c-d;
    c= c+d;

    block[stride*0 + xStride*0]= (a+c);
    block[stride*0 + xStride*1]= (e+b);
    block[stride*1 + xStride*0]= (a-c);
    block[stride*1 + xStride*1]= (e-b);
}
#endif
1553
1554 /**
1555  * gets the chroma qp.
1556  */
1557 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1558     return h->pps.chroma_qp_table[t][qscale];
1559 }
1560
1561 //FIXME need to check that this does not overflow signed 32 bit for low qp, I am not sure, it's very close
1562 //FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
1563 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
1564     int i;
1565     const int * const quant_table= quant_coeff[qscale];
1566     const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1567     const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1568     const unsigned int threshold2= (threshold1<<1);
1569     int last_non_zero;
1570
1571     if(separate_dc){
1572         if(qscale<=18){
1573             //avoid overflows
1574             const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1575             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1576             const unsigned int dc_threshold2= (dc_threshold1<<1);
1577
1578             int level= block[0]*quant_coeff[qscale+18][0];
1579             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1580                 if(level>0){
1581                     level= (dc_bias + level)>>(QUANT_SHIFT-2);
1582                     block[0]= level;
1583                 }else{
1584                     level= (dc_bias - level)>>(QUANT_SHIFT-2);
1585                     block[0]= -level;
1586                 }
1587 //                last_non_zero = i;
1588             }else{
1589                 block[0]=0;
1590             }
1591         }else{
1592             const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1593             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1594             const unsigned int dc_threshold2= (dc_threshold1<<1);
1595
1596             int level= block[0]*quant_table[0];
1597             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1598                 if(level>0){
1599                     level= (dc_bias + level)>>(QUANT_SHIFT+1);
1600                     block[0]= level;
1601                 }else{
1602                     level= (dc_bias - level)>>(QUANT_SHIFT+1);
1603                     block[0]= -level;
1604                 }
1605 //                last_non_zero = i;
1606             }else{
1607                 block[0]=0;
1608             }
1609         }
1610         last_non_zero= 0;
1611         i=1;
1612     }else{
1613         last_non_zero= -1;
1614         i=0;
1615     }
1616
1617     for(; i<16; i++){
1618         const int j= scantable[i];
1619         int level= block[j]*quant_table[j];
1620
1621 //        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
1622 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
1623         if(((unsigned)(level+threshold1))>threshold2){
1624             if(level>0){
1625                 level= (bias + level)>>QUANT_SHIFT;
1626                 block[j]= level;
1627             }else{
1628                 level= (bias - level)>>QUANT_SHIFT;
1629                 block[j]= -level;
1630             }
1631             last_non_zero = i;
1632         }else{
1633             block[j]=0;
1634         }
1635     }
1636
1637     return last_non_zero;
1638 }
1639
1640 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1641                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1642                            int src_x_offset, int src_y_offset,
1643                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1644     MpegEncContext * const s = &h->s;
1645     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1646     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1647     const int luma_xy= (mx&3) + ((my&3)<<2);
1648     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1649     uint8_t * src_cb, * src_cr;
1650     int extra_width= h->emu_edge_width;
1651     int extra_height= h->emu_edge_height;
1652     int emu=0;
1653     const int full_mx= mx>>2;
1654     const int full_my= my>>2;
1655     const int pic_width  = 16*s->mb_width;
1656     const int pic_height = 16*s->mb_height >> MB_FIELD;
1657
1658     if(!pic->data[0]) //FIXME this is unacceptable, some sensible error concealment must be done for missing reference frames
1659         return;
1660
1661     if(mx&7) extra_width -= 3;
1662     if(my&7) extra_height -= 3;
1663
1664     if(   full_mx < 0-extra_width
1665        || full_my < 0-extra_height
1666        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1667        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1668         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1669             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1670         emu=1;
1671     }
1672
1673     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1674     if(!square){
1675         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1676     }
1677
1678     if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1679
1680     if(MB_FIELD){
1681         // chroma offset when predicting from a field of opposite parity
1682         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1683         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1684     }
1685     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1686     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1687
1688     if(emu){
1689         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1690             src_cb= s->edge_emu_buffer;
1691     }
1692     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1693
1694     if(emu){
1695         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1696             src_cr= s->edge_emu_buffer;
1697     }
1698     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1699 }
1700
1701 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1702                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1703                            int x_offset, int y_offset,
1704                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1705                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1706                            int list0, int list1){
1707     MpegEncContext * const s = &h->s;
1708     qpel_mc_func *qpix_op=  qpix_put;
1709     h264_chroma_mc_func chroma_op= chroma_put;
1710
1711     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1712     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1713     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1714     x_offset += 8*s->mb_x;
1715     y_offset += 8*(s->mb_y >> MB_FIELD);
1716
1717     if(list0){
1718         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1719         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1720                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1721                            qpix_op, chroma_op);
1722
1723         qpix_op=  qpix_avg;
1724         chroma_op= chroma_avg;
1725     }
1726
1727     if(list1){
1728         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1729         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1730                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1731                            qpix_op, chroma_op);
1732     }
1733 }
1734
1735 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1736                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1737                            int x_offset, int y_offset,
1738                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1739                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1740                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1741                            int list0, int list1){
1742     MpegEncContext * const s = &h->s;
1743
1744     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1745     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1746     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1747     x_offset += 8*s->mb_x;
1748     y_offset += 8*(s->mb_y >> MB_FIELD);
1749
1750     if(list0 && list1){
1751         /* don't optimize for luma-only case, since B-frames usually
1752          * use implicit weights => chroma too. */
1753         uint8_t *tmp_cb = s->obmc_scratchpad;
1754         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1755         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1756         int refn0 = h->ref_cache[0][ scan8[n] ];
1757         int refn1 = h->ref_cache[1][ scan8[n] ];
1758
1759         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1760                     dest_y, dest_cb, dest_cr,
1761                     x_offset, y_offset, qpix_put, chroma_put);
1762         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1763                     tmp_y, tmp_cb, tmp_cr,
1764                     x_offset, y_offset, qpix_put, chroma_put);
1765
1766         if(h->use_weight == 2){
1767             int weight0 = h->implicit_weight[refn0][refn1];
1768             int weight1 = 64 - weight0;
1769             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1770             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1771             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1772         }else{
1773             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1774                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1775                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1776             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1777                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1778                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1779             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1780                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1781                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1782         }
1783     }else{
1784         int list = list1 ? 1 : 0;
1785         int refn = h->ref_cache[list][ scan8[n] ];
1786         Picture *ref= &h->ref_list[list][refn];
1787         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1788                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1789                     qpix_put, chroma_put);
1790
1791         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1792                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1793         if(h->use_weight_chroma){
1794             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1795                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1796             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1797                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1798         }
1799     }
1800 }
1801
1802 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1803                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1804                            int x_offset, int y_offset,
1805                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1806                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1807                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1808                            int list0, int list1){
1809     if((h->use_weight==2 && list0 && list1
1810         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1811        || h->use_weight==1)
1812         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1813                          x_offset, y_offset, qpix_put, chroma_put,
1814                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1815     else
1816         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1817                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1818 }
1819
1820 static inline void prefetch_motion(H264Context *h, int list){
1821     /* fetch pixels for estimated mv 4 macroblocks ahead
1822      * optimized for 64byte cache lines */
1823     MpegEncContext * const s = &h->s;
1824     const int refn = h->ref_cache[list][scan8[0]];
1825     if(refn >= 0){
1826         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1827         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1828         uint8_t **src= h->ref_list[list][refn].data;
1829         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1830         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1831         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1832         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1833     }
1834 }
1835
1836 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1837                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1838                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1839                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1840     MpegEncContext * const s = &h->s;
1841     const int mb_xy= h->mb_xy;
1842     const int mb_type= s->current_picture.mb_type[mb_xy];
1843
1844     assert(IS_INTER(mb_type));
1845
1846     prefetch_motion(h, 0);
1847
1848     if(IS_16X16(mb_type)){
1849         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1850                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1851                 &weight_op[0], &weight_avg[0],
1852                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1853     }else if(IS_16X8(mb_type)){
1854         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1855                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1856                 &weight_op[1], &weight_avg[1],
1857                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1858         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1859                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1860                 &weight_op[1], &weight_avg[1],
1861                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1862     }else if(IS_8X16(mb_type)){
1863         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1864                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1865                 &weight_op[2], &weight_avg[2],
1866                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1867         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1868                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1869                 &weight_op[2], &weight_avg[2],
1870                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1871     }else{
1872         int i;
1873
1874         assert(IS_8X8(mb_type));
1875
1876         for(i=0; i<4; i++){
1877             const int sub_mb_type= h->sub_mb_type[i];
1878             const int n= 4*i;
1879             int x_offset= (i&1)<<2;
1880             int y_offset= (i&2)<<1;
1881
1882             if(IS_SUB_8X8(sub_mb_type)){
1883                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1884                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1885                     &weight_op[3], &weight_avg[3],
1886                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1887             }else if(IS_SUB_8X4(sub_mb_type)){
1888                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1889                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1890                     &weight_op[4], &weight_avg[4],
1891                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1892                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1893                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1894                     &weight_op[4], &weight_avg[4],
1895                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1896             }else if(IS_SUB_4X8(sub_mb_type)){
1897                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1898                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1899                     &weight_op[5], &weight_avg[5],
1900                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1901                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1902                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1903                     &weight_op[5], &weight_avg[5],
1904                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1905             }else{
1906                 int j;
1907                 assert(IS_SUB_4X4(sub_mb_type));
1908                 for(j=0; j<4; j++){
1909                     int sub_x_offset= x_offset + 2*(j&1);
1910                     int sub_y_offset= y_offset +   (j&2);
1911                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1912                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1913                         &weight_op[6], &weight_avg[6],
1914                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1915                 }
1916             }
1917         }
1918     }
1919
1920     prefetch_motion(h, 1);
1921 }
1922
1923 static av_cold void decode_init_vlc(void){
1924     static int done = 0;
1925
1926     if (!done) {
1927         int i;
1928         int offset;
1929         done = 1;
1930
1931         chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1932         chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1933         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1934                  &chroma_dc_coeff_token_len [0], 1, 1,
1935                  &chroma_dc_coeff_token_bits[0], 1, 1,
1936                  INIT_VLC_USE_NEW_STATIC);
1937
1938         offset = 0;
1939         for(i=0; i<4; i++){
1940             coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1941             coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1942             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1943                      &coeff_token_len [i][0], 1, 1,
1944                      &coeff_token_bits[i][0], 1, 1,
1945                      INIT_VLC_USE_NEW_STATIC);
1946             offset += coeff_token_vlc_tables_size[i];
1947         }
1948         /*
1949          * This is a one time safety check to make sure that
1950          * the packed static coeff_token_vlc table sizes
1951          * were initialized correctly.
1952          */
1953         assert(offset == sizeof(coeff_token_vlc_tables)/(sizeof(VLC_TYPE)*2));
1954
1955         for(i=0; i<3; i++){
1956             chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1957             chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1958             init_vlc(&chroma_dc_total_zeros_vlc[i],
1959                      CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1960                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1961                      &chroma_dc_total_zeros_bits[i][0], 1, 1,
1962                      INIT_VLC_USE_NEW_STATIC);
1963         }
1964         for(i=0; i<15; i++){
1965             total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1966             total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1967             init_vlc(&total_zeros_vlc[i],
1968                      TOTAL_ZEROS_VLC_BITS, 16,
1969                      &total_zeros_len [i][0], 1, 1,
1970                      &total_zeros_bits[i][0], 1, 1,
1971                      INIT_VLC_USE_NEW_STATIC);
1972         }
1973
1974         for(i=0; i<6; i++){
1975             run_vlc[i].table = run_vlc_tables[i];
1976             run_vlc[i].table_allocated = run_vlc_tables_size;
1977             init_vlc(&run_vlc[i],
1978                      RUN_VLC_BITS, 7,
1979                      &run_len [i][0], 1, 1,
1980                      &run_bits[i][0], 1, 1,
1981                      INIT_VLC_USE_NEW_STATIC);
1982         }
1983         run7_vlc.table = run7_vlc_table,
1984         run7_vlc.table_allocated = run7_vlc_table_size;
1985         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1986                  &run_len [6][0], 1, 1,
1987                  &run_bits[6][0], 1, 1,
1988                  INIT_VLC_USE_NEW_STATIC);
1989     }
1990 }
1991
1992 static void free_tables(H264Context *h){
1993     int i;
1994     H264Context *hx;
1995     av_freep(&h->intra4x4_pred_mode);
1996     av_freep(&h->chroma_pred_mode_table);
1997     av_freep(&h->cbp_table);
1998     av_freep(&h->mvd_table[0]);
1999     av_freep(&h->mvd_table[1]);
2000     av_freep(&h->direct_table);
2001     av_freep(&h->non_zero_count);
2002     av_freep(&h->slice_table_base);
2003     h->slice_table= NULL;
2004
2005     av_freep(&h->mb2b_xy);
2006     av_freep(&h->mb2b8_xy);
2007
2008     for(i = 0; i < MAX_SPS_COUNT; i++)
2009         av_freep(h->sps_buffers + i);
2010
2011     for(i = 0; i < MAX_PPS_COUNT; i++)
2012         av_freep(h->pps_buffers + i);
2013
2014     for(i = 0; i < h->s.avctx->thread_count; i++) {
2015         hx = h->thread_context[i];
2016         if(!hx) continue;
2017         av_freep(&hx->top_borders[1]);
2018         av_freep(&hx->top_borders[0]);
2019         av_freep(&hx->s.obmc_scratchpad);
2020     }
2021 }
2022
2023 static void init_dequant8_coeff_table(H264Context *h){
2024     int i,q,x;
2025     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2026     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2027     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2028
2029     for(i=0; i<2; i++ ){
2030         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2031             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2032             break;
2033         }
2034
2035         for(q=0; q<52; q++){
2036             int shift = ff_div6[q];
2037             int idx = ff_rem6[q];
2038             for(x=0; x<64; x++)
2039                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2040                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2041                     h->pps.scaling_matrix8[i][x]) << shift;
2042         }
2043     }
2044 }
2045
2046 static void init_dequant4_coeff_table(H264Context *h){
2047     int i,j,q,x;
2048     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2049     for(i=0; i<6; i++ ){
2050         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2051         for(j=0; j<i; j++){
2052             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2053                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2054                 break;
2055             }
2056         }
2057         if(j<i)
2058             continue;
2059
2060         for(q=0; q<52; q++){
2061             int shift = ff_div6[q] + 2;
2062             int idx = ff_rem6[q];
2063             for(x=0; x<16; x++)
2064                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2065                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2066                     h->pps.scaling_matrix4[i][x]) << shift;
2067         }
2068     }
2069 }
2070
2071 static void init_dequant_tables(H264Context *h){
2072     int i,x;
2073     init_dequant4_coeff_table(h);
2074     if(h->pps.transform_8x8_mode)
2075         init_dequant8_coeff_table(h);
2076     if(h->sps.transform_bypass){
2077         for(i=0; i<6; i++)
2078             for(x=0; x<16; x++)
2079                 h->dequant4_coeff[i][0][x] = 1<<6;
2080         if(h->pps.transform_8x8_mode)
2081             for(i=0; i<2; i++)
2082                 for(x=0; x<64; x++)
2083                     h->dequant8_coeff[i][0][x] = 1<<6;
2084     }
2085 }
2086
2087
2088 /**
2089  * allocates tables.
2090  * needs width/height
2091  */
2092 static int alloc_tables(H264Context *h){
2093     MpegEncContext * const s = &h->s;
2094     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2095     int x,y;
2096
2097     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2098
2099     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2100     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
2101     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2102
2103     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2104     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2105     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2106     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2107
2108     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(uint8_t));
2109     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2110
2111     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2112     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2113     for(y=0; y<s->mb_height; y++){
2114         for(x=0; x<s->mb_width; x++){
2115             const int mb_xy= x + y*s->mb_stride;
2116             const int b_xy = 4*x + 4*y*h->b_stride;
2117             const int b8_xy= 2*x + 2*y*h->b8_stride;
2118
2119             h->mb2b_xy [mb_xy]= b_xy;
2120             h->mb2b8_xy[mb_xy]= b8_xy;
2121         }
2122     }
2123
2124     s->obmc_scratchpad = NULL;
2125
2126     if(!h->dequant4_coeff[0])
2127         init_dequant_tables(h);
2128
2129     return 0;
2130 fail:
2131     free_tables(h);
2132     return -1;
2133 }
2134
2135 /**
2136  * Mimic alloc_tables(), but for every context thread.
2137  */
2138 static void clone_tables(H264Context *dst, H264Context *src){
2139     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2140     dst->non_zero_count           = src->non_zero_count;
2141     dst->slice_table              = src->slice_table;
2142     dst->cbp_table                = src->cbp_table;
2143     dst->mb2b_xy                  = src->mb2b_xy;
2144     dst->mb2b8_xy                 = src->mb2b8_xy;
2145     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2146     dst->mvd_table[0]             = src->mvd_table[0];
2147     dst->mvd_table[1]             = src->mvd_table[1];
2148     dst->direct_table             = src->direct_table;
2149
2150     dst->s.obmc_scratchpad = NULL;
2151     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2152 }
2153
2154 /**
2155  * Init context
2156  * Allocate buffers which are not shared amongst multiple threads.
2157  */
2158 static int context_init(H264Context *h){
2159     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2160     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2161
2162     return 0;
2163 fail:
2164     return -1; // free_tables will clean up for us
2165 }
2166
2167 static av_cold void common_init(H264Context *h){
2168     MpegEncContext * const s = &h->s;
2169
2170     s->width = s->avctx->width;
2171     s->height = s->avctx->height;
2172     s->codec_id= s->avctx->codec->id;
2173
2174     ff_h264_pred_init(&h->hpc, s->codec_id);
2175
2176     h->dequant_coeff_pps= -1;
2177     s->unrestricted_mv=1;
2178     s->decode=1; //FIXME
2179
2180     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2181     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2182 }
2183
2184 static av_cold int decode_init(AVCodecContext *avctx){
2185     H264Context *h= avctx->priv_data;
2186     MpegEncContext * const s = &h->s;
2187
2188     MPV_decode_defaults(s);
2189
2190     s->avctx = avctx;
2191     common_init(h);
2192
2193     s->out_format = FMT_H264;
2194     s->workaround_bugs= avctx->workaround_bugs;
2195
2196     // set defaults
2197 //    s->decode_mb= ff_h263_decode_mb;
2198     s->quarter_sample = 1;
2199     s->low_delay= 1;
2200
2201     if(avctx->codec_id == CODEC_ID_SVQ3)
2202         avctx->pix_fmt= PIX_FMT_YUVJ420P;
2203     else
2204         avctx->pix_fmt= PIX_FMT_YUV420P;
2205
2206     decode_init_vlc();
2207
2208     if(avctx->extradata_size > 0 && avctx->extradata &&
2209        *(char *)avctx->extradata == 1){
2210         h->is_avc = 1;
2211         h->got_avcC = 0;
2212     } else {
2213         h->is_avc = 0;
2214     }
2215
2216     h->thread_context[0] = h;
2217     h->outputed_poc = INT_MIN;
2218     return 0;
2219 }
2220
2221 static int frame_start(H264Context *h){
2222     MpegEncContext * const s = &h->s;
2223     int i;
2224
2225     if(MPV_frame_start(s, s->avctx) < 0)
2226         return -1;
2227     ff_er_frame_start(s);
2228     /*
2229      * MPV_frame_start uses pict_type to derive key_frame.
2230      * This is incorrect for H.264; IDR markings must be used.
2231      * Zero here; IDR markings per slice in frame or fields are ORed in later.
2232      * See decode_nal_units().
2233      */
2234     s->current_picture_ptr->key_frame= 0;
2235
2236     assert(s->linesize && s->uvlinesize);
2237
2238     for(i=0; i<16; i++){
2239         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2240         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2241     }
2242     for(i=0; i<4; i++){
2243         h->block_offset[16+i]=
2244         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2245         h->block_offset[24+16+i]=
2246         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2247     }
2248
2249     /* can't be in alloc_tables because linesize isn't known there.
2250      * FIXME: redo bipred weight to not require extra buffer? */
2251     for(i = 0; i < s->avctx->thread_count; i++)
2252         if(!h->thread_context[i]->s.obmc_scratchpad)
2253             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2254
2255     /* some macroblocks will be accessed before they're available */
2256     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2257         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
2258
2259 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2260
2261     // We mark the current picture as non-reference after allocating it, so
2262     // that if we break out due to an error it can be released automatically
2263     // in the next MPV_frame_start().
2264     // SVQ3 as well as most other codecs have only last/next/current and thus
2265     // get released even with set reference, besides SVQ3 and others do not
2266     // mark frames as reference later "naturally".
2267     if(s->codec_id != CODEC_ID_SVQ3)
2268         s->current_picture_ptr->reference= 0;
2269
2270     s->current_picture_ptr->field_poc[0]=
2271     s->current_picture_ptr->field_poc[1]= INT_MAX;
2272     assert(s->current_picture_ptr->long_ref==0);
2273
2274     return 0;
2275 }
2276
/**
 * Saves the border samples of the macroblock at (s->mb_x, s->mb_y).
 *
 * The right-most column of each plane is copied into h->left_border and the
 * bottom row into h->top_borders[top_idx][s->mb_x] (16 luma bytes followed by
 * 8 Cb and 8 Cr bytes). hl_decode_mb_internal() calls this right before the
 * deblocking filter is run on the macroblock.
 *
 * @param src_y      first luma sample of the macroblock
 * @param src_cb     first Cb sample of the macroblock
 * @param src_cr     first Cr sample of the macroblock
 * @param linesize   luma stride used for this macroblock
 * @param uvlinesize chroma stride used for this macroblock
 * @param simple     non-zero disables the MBAFF handling and the gray-only
 *                   check
 */
2277 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2278     MpegEncContext * const s = &h->s;
2279     int i;
2280     int step    = 1;
2281     int offset  = 1;
2282     int uvoffset= 1;
2283     int top_idx = 1;
2284     int skiplast= 0;
2285
         /* step back one line, so that index i*linesize below addresses row i-1
          * of the macroblock */
2286     src_y  -=   linesize;
2287     src_cb -= uvlinesize;
2288     src_cr -= uvlinesize;
2289
         /* MBAFF: select where in left_border / top_borders this macroblock of
          * the pair is stored, depending on whether it is the top or bottom one
          * and on whether the pair is field coded */
2290     if(!simple && FRAME_MBAFF){
2291         if(s->mb_y&1){
2292             offset  = MB_MBAFF ? 1 : 17;
2293             uvoffset= MB_MBAFF ? 1 : 9;
2294             if(!MB_MBAFF){
2295                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y +  15*linesize);
2296                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
2297                 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2298                     *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
2299                     *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
2300                 }
2301             }
2302         }else{
2303             if(!MB_MBAFF){
2304                 h->left_border[0]= h->top_borders[0][s->mb_x][15];
2305                 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2306                     h->left_border[34   ]= h->top_borders[0][s->mb_x][16+7  ];
2307                     h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
2308                 }
                     /* the loops below then stop one row early */
2309                 skiplast= 1;
2310             }
2311             offset  =
2312             uvoffset=
2313             top_idx = MB_MBAFF ? 0 : 1;
2314         }
2315         step= MB_MBAFF ? 2 : 1;
2316     }
2317
2318     // There are two lines saved, the line above the top macroblock of a pair,
2319     // and the line above the bottom macroblock
         /* luma: first entry comes from the previously saved top line, the
          * remaining ones are the last column (x = 15) of this macroblock */
2320     h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
2321     for(i=1; i<17 - skiplast; i++){
2322         h->left_border[offset+i*step]= src_y[15+i*  linesize];
2323     }
2324
         /* luma: bottom row of this macroblock becomes the new saved top line */
2325     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2326     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2327
         /* chroma: same scheme with 8 rows, Cb at offset 34 and Cr at 34+18 in
          * left_border, bytes 16..23 / 24..31 of the top border entry */
2328     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2329         h->left_border[uvoffset+34   ]= h->top_borders[top_idx][s->mb_x][16+7];
2330         h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
2331         for(i=1; i<9 - skiplast; i++){
2332             h->left_border[uvoffset+34   +i*step]= src_cb[7+i*uvlinesize];
2333             h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
2334         }
2335         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2336         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2337     }
2338 }
2339
/**
 * Exchanges the samples left of and above the current macroblock in the
 * picture with the copies kept in h->left_border / h->top_borders.
 *
 * The picture always receives the stored samples. Whether the stored copy in
 * turn receives the samples that were in the picture depends on the xchg
 * argument (and, for a few positions, happens unconditionally).
 * hl_decode_mb_internal() calls this with xchg=1 before intra prediction and
 * with xchg=0 afterwards; presumably this lets intra prediction see the
 * neighbours as they were before deblocking -- TODO confirm.
 *
 * @param src_y      first luma sample of the macroblock
 * @param src_cb     first Cb sample of the macroblock
 * @param src_cr     first Cr sample of the macroblock
 * @param linesize   luma stride used for this macroblock
 * @param uvlinesize chroma stride used for this macroblock
 * @param xchg       non-zero: swap both ways; zero: only copy the stored
 *                   samples into the picture
 * @param simple     non-zero disables the MBAFF handling and the gray-only
 *                   check
 */
2340 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2341     MpegEncContext * const s = &h->s;
2342     int temp8, i;
2343     uint64_t temp64;
2344     int deblock_left;
2345     int deblock_top;
2346     int mb_xy;
2347     int step    = 1;
2348     int offset  = 1;
2349     int uvoffset= 1;
2350     int top_idx = 1;
2351
         /* same left_border / top_borders layout selection as in
          * backup_mb_border() */
2352     if(!simple && FRAME_MBAFF){
2353         if(s->mb_y&1){
2354             offset  = MB_MBAFF ? 1 : 17;
2355             uvoffset= MB_MBAFF ? 1 : 9;
2356         }else{
2357             offset  =
2358             uvoffset=
2359             top_idx = MB_MBAFF ? 0 : 1;
2360         }
2361         step= MB_MBAFF ? 2 : 1;
2362     }
2363
         /* with deblocking_filter == 2 an edge only counts when the neighbour
          * has the same slice_table entry; otherwise every edge that is not on
          * the picture border counts */
2364     if(h->deblocking_filter == 2) {
2365         mb_xy = h->mb_xy;
2366         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2367         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2368     } else {
2369         deblock_left = (s->mb_x > 0);
2370         deblock_top =  (s->mb_y > 0);
2371     }
2372
         /* move to the sample diagonally above-left of the macroblock */
2373     src_y  -=   linesize + 1;
2374     src_cb -= uvlinesize + 1;
2375     src_cr -= uvlinesize + 1;
2376
     /* XCHG(a,b,t,xchg): b always receives the old value of a; a receives the
      * old value of b only when xchg is non-zero. t is scratch storage. */
2377 #define XCHG(a,b,t,xchg)\
2378 t= a;\
2379 if(xchg)\
2380     a= b;\
2381 b= t;
2382
         /* luma, left column; row 0 (the corner) is skipped when there is no
          * top neighbour, the last row is always swapped both ways */
2383     if(deblock_left){
2384         for(i = !deblock_top; i<16; i++){
2385             XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, xchg);
2386         }
2387         XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, 1);
2388     }
2389
         /* luma, row above; also the first 8 samples above the macroblock to
          * the right, if there is one */
2390     if(deblock_top){
2391         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2392         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2393         if(s->mb_x+1 < s->mb_width){
2394             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2395         }
2396     }
2397
         /* chroma, same scheme with 8 rows / 8 samples per plane */
2398     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2399         if(deblock_left){
2400             for(i = !deblock_top; i<8; i++){
2401                 XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, xchg);
2402                 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
2403             }
2404             XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, 1);
2405             XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
2406         }
2407         if(deblock_top){
2408             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2409             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2410         }
2411     }
2412 }
2413
/**
 * Reconstructs the current macroblock (h->mb_xy) into the current picture:
 * prediction (intra or motion compensated), addition of the residual from
 * h->mb and, when h->deblocking_filter is set, border backup and loop
 * filtering.
 *
 * The function is always inlined into hl_decode_mb_simple() (simple=1) and
 * hl_decode_mb_complex() (simple=0).
 *
 * @param simple non-zero skips the checks for field/MBAFF macroblocks,
 *               intra PCM, gray-only decoding, encoding and non-H.264 (SVQ3)
 *               streams
 */
2414 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2415     MpegEncContext * const s = &h->s;
2416     const int mb_x= s->mb_x;
2417     const int mb_y= s->mb_y;
2418     const int mb_xy= h->mb_xy;
2419     const int mb_type= s->current_picture.mb_type[mb_xy];
2420     uint8_t  *dest_y, *dest_cb, *dest_cr;
2421     int linesize, uvlinesize /*dct_offset*/;
2422     int i;
2423     int *block_offset = &h->block_offset[0];
2424     const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
2425     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2426     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2427
         /* top-left sample of this macroblock in each plane */
2428     dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2429     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2430     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2431
2432     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2433     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2434
         /* field macroblock: use doubled strides and the second set of block
          * offsets; the macroblock with odd mb_y starts 15 luma / 7 chroma
          * lines higher */
2435     if (!simple && MB_FIELD) {
2436         linesize   = h->mb_linesize   = s->linesize * 2;
2437         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2438         block_offset = &h->block_offset[24];
2439         if(mb_y&1){ //FIXME move out of this function?
2440             dest_y -= s->linesize*15;
2441             dest_cb-= s->uvlinesize*7;
2442             dest_cr-= s->uvlinesize*7;
2443         }
2444         if(FRAME_MBAFF) {
2445             int list;
                 /* rewrite the cached reference indices as (16+ref)^(mb_y&1) */
2446             for(list=0; list<h->list_count; list++){
2447                 if(!USES_LIST(mb_type, list))
2448                     continue;
2449                 if(IS_16X16(mb_type)){
2450                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2451                     fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2452                 }else{
2453                     for(i=0; i<16; i+=4){
2454                         //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
2455                         int ref = h->ref_cache[list][scan8[i]];
2456                         if(ref >= 0)
2457                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2458                     }
2459                 }
2460             }
2461         }
2462     } else {
2463         linesize   = h->mb_linesize   = s->linesize;
2464         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2465 //        dct_offset = s->linesize * 16;
2466     }
2467
         /* pick the luma residual functions: plain pixel addition for
          * transform bypass, otherwise the 8x8 or 4x4 IDCT */
2468     if(transform_bypass){
2469         idct_dc_add =
2470         idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2471     }else if(IS_8x8DCT(mb_type)){
2472         idct_dc_add = s->dsp.h264_idct8_dc_add;
2473         idct_add = s->dsp.h264_idct8_add;
2474     }else{
2475         idct_dc_add = s->dsp.h264_idct_dc_add;
2476         idct_add = s->dsp.h264_idct_add;
2477     }
2478
         /* intra PCM: the samples are copied from h->mb as they are */
2479     if (!simple && IS_INTRA_PCM(mb_type)) {
2480         for (i=0; i<16; i++) {
2481             memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
2482         }
2483         for (i=0; i<8; i++) {
2484             memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
2485             memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
2486         }
2487     } else {
2488         if(IS_INTRA(mb_type)){
                 /* swap in the saved border samples for the duration of the
                  * intra prediction (swapped back further down) */
2489             if(h->deblocking_filter)
2490                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2491
2492             if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2493                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2494                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2495             }
2496
2497             if(IS_INTRA4x4(mb_type)){
2498                 if(simple || !s->encoding){
                         /* intra NxN: predict and add the residual block by
                          * block, 8x8 blocks first, 4x4 blocks in the else */
2499                     if(IS_8x8DCT(mb_type)){
2500                         for(i=0; i<16; i+=4){
2501                             uint8_t * const ptr= dest_y + block_offset[i];
2502                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2503                             const int nnz = h->non_zero_count_cache[ scan8[i] ];
2504                             h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2505                                                    (h->topright_samples_available<<i)&0x4000, linesize);
2506                             if(nnz){
2507                                 if(nnz == 1 && h->mb[i*16])
2508                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2509                                 else
2510                                     idct_add(ptr, h->mb + i*16, linesize);
2511                             }
2512                         }
2513                     }else
2514                     for(i=0; i<16; i++){
2515                         uint8_t * const ptr= dest_y + block_offset[i];
2516                         uint8_t *topright;
2517                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2518                         int nnz, tr;
2519
                             /* these two modes read the top-right samples; when
                              * unavailable, the last top sample is replicated */
2520                         if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2521                             const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2522                             assert(mb_y || linesize <= block_offset[i]);
2523                             if(!topright_avail){
2524                                 tr= ptr[3 - linesize]*0x01010101;
2525                                 topright= (uint8_t*) &tr;
2526                             }else
2527                                 topright= ptr + 4 - linesize;
2528                         }else
2529                             topright= NULL;
2530
2531                         h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2532                         nnz = h->non_zero_count_cache[ scan8[i] ];
2533                         if(nnz){
2534                             if(is_h264){
2535                                 if(nnz == 1 && h->mb[i*16])
2536                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2537                                 else
2538                                     idct_add(ptr, h->mb + i*16, linesize);
2539                             }else
2540                                 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2541                         }
2542                     }
2543                 }
2544             }else{
                     /* intra 16x16: predict the whole macroblock and transform
                      * the luma DC coefficients; the residual is added below */
2545                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2546                 if(is_h264){
2547                     if(!transform_bypass)
2548                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2549                 }else
2550                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2551             }
2552             if(h->deblocking_filter)
2553                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2554         }else if(is_h264){
                 /* inter macroblock: motion compensation */
2555             hl_motion(h, dest_y, dest_cb, dest_cr,
2556                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2557                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2558                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2559         }
2560
2561
             /* luma residual for everything except intra NxN, which was
              * handled block by block above */
2562         if(!IS_INTRA4x4(mb_type)){
2563             if(is_h264){
2564                 if(IS_INTRA16x16(mb_type)){
2565                     for(i=0; i<16; i++){
2566                         if(h->non_zero_count_cache[ scan8[i] ])
2567                             idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2568                         else if(h->mb[i*16])
2569                             idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2570                     }
2571                 }else{
2572                     const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2573                     for(i=0; i<16; i+=di){
2574                         int nnz = h->non_zero_count_cache[ scan8[i] ];
2575                         if(nnz){
2576                             if(nnz==1 && h->mb[i*16])
2577                                 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2578                             else
2579                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2580                         }
2581                     }
2582                 }
2583             }else{
2584                 for(i=0; i<16; i++){
2585                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2586                         uint8_t * const ptr= dest_y + block_offset[i];
2587                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2588                     }
2589                 }
2590             }
2591         }
2592
             /* chroma residual: blocks 16..19 go to Cb, 20..23 to Cr */
2593         if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2594             uint8_t *dest[2] = {dest_cb, dest_cr};
2595             if(transform_bypass){
2596                 idct_add = idct_dc_add = s->dsp.add_pixels4;
2597             }else{
2598                 idct_add = s->dsp.h264_idct_add;
2599                 idct_dc_add = s->dsp.h264_idct_dc_add;
2600                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2601                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2602             }
2603             if(is_h264){
2604                 for(i=16; i<16+8; i++){
2605                     if(h->non_zero_count_cache[ scan8[i] ])
2606                         idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2607                     else if(h->mb[i*16])
2608                         idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2609                 }
2610             }else{
2611                 for(i=16; i<16+8; i++){
2612                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2613                         uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2614                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2615                     }
2616                 }
2617             }
2618         }
2619     }
         /* loop filter: save the borders first, then refill the caches and the
          * chroma QPs from the stored qscale before filtering */
2620     if(h->deblocking_filter) {
2621         backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2622         fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2623         h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2624         h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2625         if (!simple && FRAME_MBAFF) {
2626             filter_mb     (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2627         } else {
2628             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2629         }
2630     }
2631 }
2632
2633 /**
2634  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2635  */
2636 static void hl_decode_mb_simple(H264Context *h){
2637     hl_decode_mb_internal(h, 1);
2638 }
2639
2640 /**
2641  * Process a macroblock; this handles edge cases, such as interlacing.
2642  */
2643 static void av_noinline hl_decode_mb_complex(H264Context *h){
2644     hl_decode_mb_internal(h, 0);
2645 }
2646
2647 static void hl_decode_mb(H264Context *h){
2648     MpegEncContext * const s = &h->s;
2649     const int mb_xy= h->mb_xy;
2650     const int mb_type= s->current_picture.mb_type[mb_xy];
2651     int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 ||
2652                     (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || (ENABLE_H264_ENCODER && s->encoding) || ENABLE_SMALL;
2653
2654     if(ENABLE_H264_ENCODER && !s->decode)
2655         return;
2656
2657     if (is_complex)
2658         hl_decode_mb_complex(h);
2659     else hl_decode_mb_simple(h);
2660 }
2661
2662 static void pic_as_field(Picture *pic, const int parity){
2663     int i;
2664     for (i = 0; i < 4; ++i) {
2665         if (parity == PICT_BOTTOM_FIELD)
2666             pic->data[i] += pic->linesize[i];
2667         pic->reference = parity;
2668         pic->linesize[i] *= 2;
2669     }
2670     pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
2671 }
2672
2673 static int split_field_copy(Picture *dest, Picture *src,
2674                             int parity, int id_add){
2675     int match = !!(src->reference & parity);
2676
2677     if (match) {
2678         *dest = *src;
2679         if(parity != PICT_FRAME){
2680             pic_as_field(dest, parity);
2681             dest->pic_id *= 2;
2682             dest->pic_id += id_add;
2683         }
2684     }
2685
2686     return match;
2687 }
2688
2689 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2690     int i[2]={0};
2691     int index=0;
2692
2693     while(i[0]<len || i[1]<len){
2694         while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2695             i[0]++;
2696         while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2697             i[1]++;
2698         if(i[0] < len){
2699             in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2700             split_field_copy(&def[index++], in[ i[0]++ ], sel  , 1);
2701         }
2702         if(i[1] < len){
2703             in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2704             split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2705         }
2706     }
2707
2708     return index;
2709 }
2710
2711 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2712     int i, best_poc;
2713     int out_i= 0;
2714
2715     for(;;){
2716         best_poc= dir ? INT_MIN : INT_MAX;
2717
2718         for(i=0; i<len; i++){
2719             const int poc= src[i]->poc;
2720             if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2721                 best_poc= poc;
2722                 sorted[out_i]= src[i];
2723             }
2724         }
2725         if(best_poc == (dir ? INT_MIN : INT_MAX))
2726             break;
2727         limit= sorted[out_i++]->poc - dir;
2728     }
2729     return out_i;
2730 }
2731
2732 /**
2733  * fills the default_ref_list.
2734  */
2735 static int fill_default_ref_list(H264Context *h){
2736     MpegEncContext * const s = &h->s;
2737     int i, len;
2738
2739     if(h->slice_type_nos==FF_B_TYPE){
2740         Picture *sorted[32];
2741         int cur_poc, list;
2742         int lens[2];
2743
2744         if(FIELD_PICTURE)
2745             cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2746         else
2747             cur_poc= s->current_picture_ptr->poc;
2748
2749         for(list= 0; list<2; list++){
2750             len= add_sorted(sorted    , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2751             len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2752             assert(len<=32);
2753             len= build_def_list(h->default_ref_list[list]    , sorted     , len, 0, s->picture_structure);
2754             len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2755             assert(len<=32);
2756
2757             if(len < h->ref_count[list])
2758                 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2759             lens[list]= len;
2760         }
2761
2762         if(lens[0] == lens[1] && lens[1] > 1){
2763             for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2764             if(i == lens[0])
2765                 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2766         }
2767     }else{
2768         len = build_def_list(h->default_ref_list[0]    , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2769         len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16                , 1, s->picture_structure);
2770         assert(len <= 32);
2771         if(len < h->ref_count[0])
2772             memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2773     }
2774 #ifdef TRACE
2775     for (i=0; i<h->ref_count[0]; i++) {
2776         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2777     }
2778     if(h->slice_type_nos==FF_B_TYPE){
2779         for (i=0; i<h->ref_count[1]; i++) {
2780             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2781         }
2782     }
2783 #endif
2784     return 0;
2785 }
2786
/* Forward declarations: both debug helpers are defined further down in this
 * file but are already called by decode_ref_pic_list_reordering(). */
static void print_short_term(H264Context *h);
static void print_long_term(H264Context *h);
2789
2790 /**
2791  * Extract structure information about the picture described by pic_num in
2792  * the current decoding context (frame or field). Note that pic_num is
2793  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2794  * @param pic_num picture number for which to extract structure information
2795  * @param structure one of PICT_XXX describing structure of picture
2796  *                      with pic_num
2797  * @return frame number (short term) or long term index of picture
2798  *         described by pic_num
2799  */
2800 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2801     MpegEncContext * const s = &h->s;
2802
2803     *structure = s->picture_structure;
2804     if(FIELD_PICTURE){
2805         if (!(pic_num & 1))
2806             /* opposite field */
2807             *structure ^= PICT_FRAME;
2808         pic_num >>= 1;
2809     }
2810
2811     return pic_num;
2812 }
2813
2814 static int decode_ref_pic_list_reordering(H264Context *h){
2815     MpegEncContext * const s = &h->s;
2816     int list, index, pic_structure;
2817
2818     print_short_term(h);
2819     print_long_term(h);
2820
2821     for(list=0; list<h->list_count; list++){
2822         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2823
2824         if(get_bits1(&s->gb)){
2825             int pred= h->curr_pic_num;
2826
2827             for(index=0; ; index++){
2828                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
2829                 unsigned int pic_id;
2830                 int i;
2831                 Picture *ref = NULL;
2832
2833                 if(reordering_of_pic_nums_idc==3)
2834                     break;
2835
2836                 if(index >= h->ref_count[list]){
2837                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2838                     return -1;
2839                 }
2840
2841                 if(reordering_of_pic_nums_idc<3){
2842                     if(reordering_of_pic_nums_idc<2){
2843                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2844                         int frame_num;
2845
2846                         if(abs_diff_pic_num > h->max_pic_num){
2847                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2848                             return -1;
2849                         }
2850
2851                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2852                         else                                pred+= abs_diff_pic_num;
2853                         pred &= h->max_pic_num - 1;
2854
2855                         frame_num = pic_num_extract(h, pred, &pic_structure);
2856
2857                         for(i= h->short_ref_count-1; i>=0; i--){
2858                             ref = h->short_ref[i];
2859                             assert(ref->reference);
2860                             assert(!ref->long_ref);
2861                             if(
2862                                    ref->frame_num == frame_num &&
2863                                    (ref->reference & pic_structure)
2864                               )
2865                                 break;
2866                         }
2867                         if(i>=0)
2868                             ref->pic_id= pred;
2869                     }else{
2870                         int long_idx;
2871                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2872
2873                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
2874
2875                         if(long_idx>31){
2876                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2877                             return -1;
2878                         }
2879                         ref = h->long_ref[long_idx];
2880                         assert(!(ref && !ref->reference));
2881                         if(ref && (ref->reference & pic_structure)){
2882                             ref->pic_id= pic_id;
2883                             assert(ref->long_ref);
2884                             i=0;
2885                         }else{
2886                             i=-1;
2887                         }
2888                     }
2889
2890                     if (i < 0) {
2891                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2892                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2893                     } else {
2894                         for(i=index; i+1<h->ref_count[list]; i++){
2895                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2896                                 break;
2897                         }
2898                         for(; i > index; i--){
2899                             h->ref_list[list][i]= h->ref_list[list][i-1];
2900                         }
2901                         h->ref_list[list][index]= *ref;
2902                         if (FIELD_PICTURE){
2903                             pic_as_field(&h->ref_list[list][index], pic_structure);
2904                         }
2905                     }
2906                 }else{
2907                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2908                     return -1;
2909                 }
2910             }
2911         }
2912     }
2913     for(list=0; list<h->list_count; list++){
2914         for(index= 0; index < h->ref_count[list]; index++){
2915             if(!h->ref_list[list][index].data[0]){
2916                 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2917                 h->ref_list[list][index]= s->current_picture; //FIXME this is not a sensible solution
2918             }
2919         }
2920     }
2921
2922     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
2923         direct_dist_scale_factor(h);
2924     direct_ref_list_init(h);
2925     return 0;
2926 }
2927
2928 static void fill_mbaff_ref_list(H264Context *h){
2929     int list, i, j;
2930     for(list=0; list<2; list++){ //FIXME try list_count
2931         for(i=0; i<h->ref_count[list]; i++){
2932             Picture *frame = &h->ref_list[list][i];
2933             Picture *field = &h->ref_list[list][16+2*i];
2934             field[0] = *frame;
2935             for(j=0; j<3; j++)
2936                 field[0].linesize[j] <<= 1;
2937             field[0].reference = PICT_TOP_FIELD;
2938             field[1] = field[0];
2939             for(j=0; j<3; j++)
2940                 field[1].data[j] += frame->linesize[j];
2941             field[1].reference = PICT_BOTTOM_FIELD;
2942
2943             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
2944             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
2945             for(j=0; j<2; j++){
2946                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
2947                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
2948             }
2949         }
2950     }
2951     for(j=0; j<h->ref_count[1]; j++){
2952         for(i=0; i<h->ref_count[0]; i++)
2953             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
2954         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
2955         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
2956     }
2957 }
2958
2959 static int pred_weight_table(H264Context *h){
2960     MpegEncContext * const s = &h->s;
2961     int list, i;
2962     int luma_def, chroma_def;
2963
2964     h->use_weight= 0;
2965     h->use_weight_chroma= 0;
2966     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
2967     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
2968     luma_def = 1<<h->luma_log2_weight_denom;
2969     chroma_def = 1<<h->chroma_log2_weight_denom;
2970
2971     for(list=0; list<2; list++){
2972         for(i=0; i<h->ref_count[list]; i++){
2973             int luma_weight_flag, chroma_weight_flag;
2974
2975             luma_weight_flag= get_bits1(&s->gb);
2976             if(luma_weight_flag){
2977                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
2978                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
2979                 if(   h->luma_weight[list][i] != luma_def
2980                    || h->luma_offset[list][i] != 0)
2981                     h->use_weight= 1;
2982             }else{
2983                 h->luma_weight[list][i]= luma_def;
2984                 h->luma_offset[list][i]= 0;
2985             }
2986
2987             if(CHROMA){
2988                 chroma_weight_flag= get_bits1(&s->gb);
2989                 if(chroma_weight_flag){
2990                     int j;
2991                     for(j=0; j<2; j++){
2992                         h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
2993                         h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
2994                         if(   h->chroma_weight[list][i][j] != chroma_def
2995                         || h->chroma_offset[list][i][j] != 0)
2996                             h->use_weight_chroma= 1;
2997                     }
2998                 }else{
2999                     int j;
3000                     for(j=0; j<2; j++){
3001                         h->chroma_weight[list][i][j]= chroma_def;
3002                         h->chroma_offset[list][i][j]= 0;
3003                     }
3004                 }
3005             }
3006         }
3007         if(h->slice_type_nos != FF_B_TYPE) break;
3008     }
3009     h->use_weight= h->use_weight || h->use_weight_chroma;
3010     return 0;
3011 }
3012
3013 static void implicit_weight_table(H264Context *h){
3014     MpegEncContext * const s = &h->s;
3015     int ref0, ref1;
3016     int cur_poc = s->current_picture_ptr->poc;
3017
3018     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3019        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3020         h->use_weight= 0;
3021         h->use_weight_chroma= 0;
3022         return;
3023     }
3024
3025     h->use_weight= 2;
3026     h->use_weight_chroma= 2;
3027     h->luma_log2_weight_denom= 5;
3028     h->chroma_log2_weight_denom= 5;
3029
3030     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3031         int poc0 = h->ref_list[0][ref0].poc;
3032         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3033             int poc1 = h->ref_list[1][ref1].poc;
3034             int td = av_clip(poc1 - poc0, -128, 127);
3035             if(td){
3036                 int tb = av_clip(cur_poc - poc0, -128, 127);
3037                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3038                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3039                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3040                     h->implicit_weight[ref0][ref1] = 32;
3041                 else
3042                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3043             }else
3044                 h->implicit_weight[ref0][ref1] = 32;
3045         }
3046     }
3047 }
3048
3049 /**
3050  * Mark a picture as no longer needed for reference. The refmask
3051  * argument allows unreferencing of individual fields or the whole frame.
3052  * If the picture becomes entirely unreferenced, but is being held for
3053  * display purposes, it is marked as such.
3054  * @param refmask mask of fields to unreference; the mask is bitwise
3055  *                anded with the reference marking of pic
3056  * @return non-zero if pic becomes entirely unreferenced (except possibly
3057  *         for display purposes) zero if one of the fields remains in
3058  *         reference
3059  */
3060 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3061     int i;
3062     if (pic->reference &= refmask) {
3063         return 0;
3064     } else {
3065         for(i = 0; h->delayed_pic[i]; i++)
3066             if(pic == h->delayed_pic[i]){
3067                 pic->reference=DELAYED_PIC_REF;
3068                 break;
3069             }
3070         return 1;
3071     }
3072 }
3073
3074 /**
3075  * instantaneous decoder refresh.
3076  */
3077 static void idr(H264Context *h){
3078     int i;
3079
3080     for(i=0; i<16; i++){
3081         remove_long(h, i, 0);
3082     }
3083     assert(h->long_ref_count==0);
3084
3085     for(i=0; i<h->short_ref_count; i++){
3086         unreference_pic(h, h->short_ref[i], 0);
3087         h->short_ref[i]= NULL;
3088     }
3089     h->short_ref_count=0;
3090     h->prev_frame_num= 0;
3091     h->prev_frame_num_offset= 0;
3092     h->prev_poc_msb=
3093     h->prev_poc_lsb= 0;
3094 }
3095
3096 /* forget old pics after a seek */
3097 static void flush_dpb(AVCodecContext *avctx){
3098     H264Context *h= avctx->priv_data;
3099     int i;
3100     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3101         if(h->delayed_pic[i])
3102             h->delayed_pic[i]->reference= 0;
3103         h->delayed_pic[i]= NULL;
3104     }
3105     h->outputed_poc= INT_MIN;
3106     idr(h);
3107     if(h->s.current_picture_ptr)
3108         h->s.current_picture_ptr->reference= 0;
3109     h->s.first_field= 0;
3110     ff_mpeg_flush(avctx);
3111 }
3112
3113 /**
3114  * Find a Picture in the short term reference list by frame number.
3115  * @param frame_num frame number to search for
3116  * @param idx the index into h->short_ref where returned picture is found
3117  *            undefined if no picture found.
3118  * @return pointer to the found picture, or NULL if no pic with the provided
3119  *                 frame number is found
3120  */
3121 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3122     MpegEncContext * const s = &h->s;
3123     int i;
3124
3125     for(i=0; i<h->short_ref_count; i++){
3126         Picture *pic= h->short_ref[i];
3127         if(s->avctx->debug&FF_DEBUG_MMCO)
3128             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3129         if(pic->frame_num == frame_num) {
3130             *idx = i;
3131             return pic;
3132         }
3133     }
3134     return NULL;
3135 }
3136
3137 /**
3138  * Remove a picture from the short term reference list by its index in
3139  * that list.  This does no checking on the provided index; it is assumed
3140  * to be valid. Other list entries are shifted down.
3141  * @param i index into h->short_ref of picture to remove.
3142  */
3143 static void remove_short_at_index(H264Context *h, int i){
3144     assert(i >= 0 && i < h->short_ref_count);
3145     h->short_ref[i]= NULL;
3146     if (--h->short_ref_count)
3147         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3148 }
3149
3150 /**
3151  *
3152  * @return the removed picture or NULL if an error occurs
3153  */
3154 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3155     MpegEncContext * const s = &h->s;
3156     Picture *pic;
3157     int i;
3158
3159     if(s->avctx->debug&FF_DEBUG_MMCO)
3160         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3161
3162     pic = find_short(h, frame_num, &i);
3163     if (pic){
3164         if(unreference_pic(h, pic, ref_mask))
3165         remove_short_at_index(h, i);
3166     }
3167
3168     return pic;
3169 }
3170
3171 /**
3172  * Remove a picture from the long term reference list by its index in
3173  * that list.
3174  * @return the removed picture or NULL if an error occurs
3175  */
3176 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3177     Picture *pic;
3178
3179     pic= h->long_ref[i];
3180     if (pic){
3181         if(unreference_pic(h, pic, ref_mask)){
3182             assert(h->long_ref[i]->long_ref == 1);
3183             h->long_ref[i]->long_ref= 0;
3184             h->long_ref[i]= NULL;
3185             h->long_ref_count--;
3186         }
3187     }
3188
3189     return pic;
3190 }
3191
3192 /**
3193  * print short term list
3194  */
3195 static void print_short_term(H264Context *h) {
3196     uint32_t i;
3197     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3198         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3199         for(i=0; i<h->short_ref_count; i++){
3200             Picture *pic= h->short_ref[i];
3201             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3202         }
3203     }
3204 }
3205
3206 /**
3207  * print long term list
3208  */
3209 static void print_long_term(H264Context *h) {
3210     uint32_t i;
3211     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3212         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3213         for(i = 0; i < 16; i++){
3214             Picture *pic= h->long_ref[i];
3215             if (pic) {
3216                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3217             }
3218         }
3219     }
3220 }
3221
3222 /**
3223  * Executes the reference picture marking (memory management control operations).
3224  */
3225 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3226     MpegEncContext * const s = &h->s;
3227     int i, j;
3228     int current_ref_assigned=0;
3229     Picture *pic;
3230
3231     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3232         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3233
3234     for(i=0; i<mmco_count; i++){
3235         int structure, frame_num;
3236         if(s->avctx->debug&FF_DEBUG_MMCO)
3237             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3238
3239         if(   mmco[i].opcode == MMCO_SHORT2UNUSED
3240            || mmco[i].opcode == MMCO_SHORT2LONG){
3241             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3242             pic = find_short(h, frame_num, &j);
3243             if(!pic){
3244                 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3245                    || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3246                 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3247                 continue;
3248             }
3249         }
3250
3251         switch(mmco[i].opcode){
3252         case MMCO_SHORT2UNUSED:
3253             if(s->avctx->debug&FF_DEBUG_MMCO)
3254                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3255             remove_short(h, frame_num, structure ^ PICT_FRAME);
3256             break;
3257         case MMCO_SHORT2LONG:
3258                 if (h->long_ref[mmco[i].long_arg] != pic)
3259                     remove_long(h, mmco[i].long_arg, 0);
3260
3261                 remove_short_at_index(h, j);
3262                 h->long_ref[ mmco[i].long_arg ]= pic;
3263                 if (h->long_ref[ mmco[i].long_arg ]){
3264                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3265                     h->long_ref_count++;
3266                 }
3267             break;
3268         case MMCO_LONG2UNUSED:
3269             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3270             pic = h->long_ref[j];
3271             if (pic) {
3272                 remove_long(h, j, structure ^ PICT_FRAME);
3273             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3274                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3275             break;
3276         case MMCO_LONG:
3277                     // Comment below left from previous code as it is an interresting note.
3278                     /* First field in pair is in short term list or
3279                      * at a different long term index.
3280                      * This is not allowed; see 7.4.3.3, notes 2 and 3.
3281                      * Report the problem and keep the pair where it is,
3282                      * and mark this field valid.
3283                      */
3284
3285             if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3286                 remove_long(h, mmco[i].long_arg, 0);
3287
3288                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3289                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3290                 h->long_ref_count++;
3291             }
3292
3293             s->current_picture_ptr->reference |= s->picture_structure;
3294             current_ref_assigned=1;
3295             break;
3296         case MMCO_SET_MAX_LONG:
3297             assert(mmco[i].long_arg <= 16);
3298             // just remove the long term which index is greater than new max
3299             for(j = mmco[i].long_arg; j<16; j++){
3300                 remove_long(h, j, 0);
3301             }
3302             break;
3303         case MMCO_RESET:
3304             while(h->short_ref_count){
3305                 remove_short(h, h->short_ref[0]->frame_num, 0);
3306             }
3307             for(j = 0; j < 16; j++) {
3308                 remove_long(h, j, 0);
3309             }
3310             s->current_picture_ptr->poc=
3311             s->current_picture_ptr->field_poc[0]=
3312             s->current_picture_ptr->field_poc[1]=
3313             h->poc_lsb=
3314             h->poc_msb=
3315             h->frame_num=
3316             s->current_picture_ptr->frame_num= 0;
3317             break;
3318         default: assert(0);
3319         }
3320     }
3321
3322     if (!current_ref_assigned) {
3323         /* Second field of complementary field pair; the first field of
3324          * which is already referenced. If short referenced, it
3325          * should be first entry in short_ref. If not, it must exist
3326          * in long_ref; trying to put it on the short list here is an
3327          * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
3328          */
3329         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3330             /* Just mark the second field valid */
3331             s->current_picture_ptr->reference = PICT_FRAME;
3332         } else if (s->current_picture_ptr->long_ref) {
3333             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3334                                              "assignment for second field "
3335                                              "in complementary field pair "
3336                                              "(first field is long term)\n");
3337         } else {
3338             pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3339             if(pic){
3340                 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3341             }
3342
3343             if(h->short_ref_count)
3344                 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3345
3346             h->short_ref[0]= s->current_picture_ptr;
3347             h->short_ref_count++;
3348             s->current_picture_ptr->reference |= s->picture_structure;
3349         }
3350     }
3351
3352     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3353
3354         /* We have too many reference frames, probably due to corrupted
3355          * stream. Need to discard one frame. Prevents overrun of the
3356          * short_ref and long_ref buffers.
3357          */
3358         av_log(h->s.avctx, AV_LOG_ERROR,
3359                "number of reference frames exceeds max (probably "
3360                "corrupt input), discarding one\n");
3361
3362         if (h->long_ref_count && !h->short_ref_count) {
3363             for (i = 0; i < 16; ++i)
3364                 if (h->long_ref[i])
3365                     break;
3366
3367             assert(i < 16);
3368             remove_long(h, i, 0);
3369         } else {
3370             pic = h->short_ref[h->short_ref_count - 1];
3371             remove_short(h, pic->frame_num, 0);
3372         }
3373     }
3374
3375     print_short_term(h);
3376     print_long_term(h);
3377     return 0;
3378 }
3379
3380 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3381     MpegEncContext * const s = &h->s;
3382     int i;
3383
3384     h->mmco_index= 0;
3385     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3386         s->broken_link= get_bits1(gb) -1;
3387         if(get_bits1(gb)){
3388             h->mmco[0].opcode= MMCO_LONG;
3389             h->mmco[0].long_arg= 0;
3390             h->mmco_index= 1;
3391         }
3392     }else{
3393         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3394             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3395                 MMCOOpcode opcode= get_ue_golomb(gb);
3396
3397                 h->mmco[i].opcode= opcode;
3398                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3399                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3400 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3401                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3402                         return -1;
3403                     }*/
3404                 }
3405                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3406                     unsigned int long_arg= get_ue_golomb(gb);
3407                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3408                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3409                         return -1;
3410                     }
3411                     h->mmco[i].long_arg= long_arg;
3412                 }
3413
3414                 if(opcode > (unsigned)MMCO_LONG){
3415                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3416                     return -1;
3417                 }
3418                 if(opcode == MMCO_END)
3419                     break;
3420             }
3421             h->mmco_index= i;
3422         }else{
3423             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3424
3425             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3426                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3427                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3428                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3429                 h->mmco_index= 1;
3430                 if (FIELD_PICTURE) {
3431                     h->mmco[0].short_pic_num *= 2;
3432                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3433                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3434                     h->mmco_index= 2;
3435                 }
3436             }
3437         }
3438     }
3439
3440     return 0;
3441 }
3442
3443 static int init_poc(H264Context *h){
3444     MpegEncContext * const s = &h->s;
3445     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3446     int field_poc[2];
3447     Picture *cur = s->current_picture_ptr;
3448
3449     h->frame_num_offset= h->prev_frame_num_offset;
3450     if(h->frame_num < h->prev_frame_num)
3451         h->frame_num_offset += max_frame_num;
3452
3453     if(h->sps.poc_type==0){
3454         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3455
3456         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3457             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3458         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3459             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3460         else
3461             h->poc_msb = h->prev_poc_msb;
3462 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3463         field_poc[0] =
3464         field_poc[1] = h->poc_msb + h->poc_lsb;
3465         if(s->picture_structure == PICT_FRAME)
3466             field_poc[1] += h->delta_poc_bottom;
3467     }else if(h->sps.poc_type==1){
3468         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3469         int i;
3470
3471         if(h->sps.poc_cycle_length != 0)
3472             abs_frame_num = h->frame_num_offset + h->frame_num;
3473         else
3474             abs_frame_num = 0;
3475
3476         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3477             abs_frame_num--;
3478
3479         expected_delta_per_poc_cycle = 0;
3480         for(i=0; i < h->sps.poc_cycle_length; i++)
3481             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3482
3483         if(abs_frame_num > 0){
3484             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3485             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3486
3487             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3488             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3489                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3490         } else
3491             expectedpoc = 0;
3492
3493         if(h->nal_ref_idc == 0)
3494             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3495
3496         field_poc[0] = expectedpoc + h->delta_poc[0];
3497         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3498
3499         if(s->picture_structure == PICT_FRAME)
3500             field_poc[1] += h->delta_poc[1];
3501     }else{
3502         int poc= 2*(h->frame_num_offset + h->frame_num);
3503
3504         if(!h->nal_ref_idc)
3505             poc--;
3506
3507         field_poc[0]= poc;
3508         field_poc[1]= poc;
3509     }
3510
3511     if(s->picture_structure != PICT_BOTTOM_FIELD)
3512         s->current_picture_ptr->field_poc[0]= field_poc[0];
3513     if(s->picture_structure != PICT_TOP_FIELD)
3514         s->current_picture_ptr->field_poc[1]= field_poc[1];
3515     cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3516
3517     return 0;
3518 }
3519
3520
3521 /**
3522  * initialize scan tables
3523  */
3524 static void init_scan_tables(H264Context *h){
3525     MpegEncContext * const s = &h->s;
3526     int i;
3527     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3528         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3529         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3530     }else{
3531         for(i=0; i<16; i++){
3532 #define T(x) (x>>2) | ((x<<2) & 0xF)
3533             h->zigzag_scan[i] = T(zigzag_scan[i]);
3534             h-> field_scan[i] = T( field_scan[i]);
3535 #undef T
3536         }
3537     }
3538     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3539         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
3540         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3541         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3542         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3543     }else{
3544         for(i=0; i<64; i++){
3545 #define T(x) (x>>3) | ((x&7)<<3)
3546             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
3547             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3548             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3549             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3550 #undef T
3551         }
3552     }
3553     if(h->sps.transform_bypass){ //FIXME same ugly
3554         h->zigzag_scan_q0          = zigzag_scan;
3555         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
3556         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3557         h->field_scan_q0           = field_scan;
3558         h->field_scan8x8_q0        = field_scan8x8;
3559         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3560     }else{
3561         h->zigzag_scan_q0          = h->zigzag_scan;
3562         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3563         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3564         h->field_scan_q0           = h->field_scan;
3565         h->field_scan8x8_q0        = h->field_scan8x8;
3566         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3567     }
3568 }
3569
3570 /**
3571  * Replicates H264 "master" context to thread contexts.
3572  */
3573 static void clone_slice(H264Context *dst, H264Context *src)
3574 {
3575     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3576     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3577     dst->s.current_picture      = src->s.current_picture;
3578     dst->s.linesize             = src->s.linesize;
3579     dst->s.uvlinesize           = src->s.uvlinesize;
3580     dst->s.first_field          = src->s.first_field;
3581
3582     dst->prev_poc_msb           = src->prev_poc_msb;
3583     dst->prev_poc_lsb           = src->prev_poc_lsb;
3584     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3585     dst->prev_frame_num         = src->prev_frame_num;
3586     dst->short_ref_count        = src->short_ref_count;
3587
3588     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3589     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3590     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3591     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3592
3593     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3594     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3595 }
3596
3597 /**
3598  * decodes a slice header.
3599  * This will also call MPV_common_init() and frame_start() as needed.
3600  *
3601  * @param h h264context
3602  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3603  *
3604  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3605  */
3606 static int decode_slice_header(H264Context *h, H264Context *h0){
3607     MpegEncContext * const s = &h->s;
3608     MpegEncContext * const s0 = &h0->s;
3609     unsigned int first_mb_in_slice;
3610     unsigned int pps_id;
3611     int num_ref_idx_active_override_flag;
3612     static const uint8_t slice_type_map[5]= {FF_P_TYPE, FF_B_TYPE, FF_I_TYPE, FF_SP_TYPE, FF_SI_TYPE};
3613     unsigned int slice_type, tmp, i, j;
3614     int default_ref_list_done = 0;
3615     int last_pic_structure;
3616
3617     s->dropable= h->nal_ref_idc == 0;
3618
3619     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3620         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3621         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3622     }else{
3623         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3624         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3625     }
3626
3627     first_mb_in_slice= get_ue_golomb(&s->gb);
3628
3629     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3630         h0->current_slice = 0;
3631         if (!s0->first_field)
3632             s->current_picture_ptr= NULL;
3633     }
3634
3635     slice_type= get_ue_golomb(&s->gb);
3636     if(slice_type > 9){
3637         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3638         return -1;
3639     }
3640     if(slice_type > 4){
3641         slice_type -= 5;
3642         h->slice_type_fixed=1;
3643     }else
3644         h->slice_type_fixed=0;
3645
3646     slice_type= slice_type_map[ slice_type ];
3647     if (slice_type == FF_I_TYPE
3648         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3649         default_ref_list_done = 1;
3650     }
3651     h->slice_type= slice_type;
3652     h->slice_type_nos= slice_type & 3;
3653
3654     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3655     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3656         av_log(h->s.avctx, AV_LOG_ERROR,
3657                "B picture before any references, skipping\n");
3658         return -1;
3659     }
3660
3661     pps_id= get_ue_golomb(&s->gb);
3662     if(pps_id>=MAX_PPS_COUNT){
3663         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3664         return -1;
3665     }
3666     if(!h0->pps_buffers[pps_id]) {
3667         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS referenced\n");
3668         return -1;
3669     }
3670     h->pps= *h0->pps_buffers[pps_id];
3671
3672     if(!h0->sps_buffers[h->pps.sps_id]) {
3673         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS referenced\n");
3674         return -1;
3675     }
3676     h->sps = *h0->sps_buffers[h->pps.sps_id];
3677
3678     if(h == h0 && h->dequant_coeff_pps != pps_id){
3679         h->dequant_coeff_pps = pps_id;
3680         init_dequant_tables(h);
3681     }
3682
3683     s->mb_width= h->sps.mb_width;
3684     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3685
3686     h->b_stride=  s->mb_width*4;
3687     h->b8_stride= s->mb_width*2;
3688
3689     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3690     if(h->sps.frame_mbs_only_flag)
3691         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3692     else
3693         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3694
3695     if (s->context_initialized
3696         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3697         if(h != h0)
3698             return -1;   // width / height changed during parallelized decoding
3699         free_tables(h);
3700         MPV_common_end(s);
3701     }
3702     if (!s->context_initialized) {
3703         if(h != h0)
3704             return -1;  // we cant (re-)initialize context during parallel decoding
3705         if (MPV_common_init(s) < 0)
3706             return -1;
3707         s->first_field = 0;
3708
3709         init_scan_tables(h);
3710         alloc_tables(h);
3711
3712         for(i = 1; i < s->avctx->thread_count; i++) {
3713             H264Context *c;
3714             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3715             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3716             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3717             c->sps = h->sps;
3718             c->pps = h->pps;
3719             init_scan_tables(c);
3720             clone_tables(c, h);
3721         }
3722
3723         for(i = 0; i < s->avctx->thread_count; i++)
3724             if(context_init(h->thread_context[i]) < 0)
3725                 return -1;
3726
3727         s->avctx->width = s->width;
3728         s->avctx->height = s->height;
3729         s->avctx->sample_aspect_ratio= h->sps.sar;
3730         if(!s->avctx->sample_aspect_ratio.den)
3731             s->avctx->sample_aspect_ratio.den = 1;
3732
3733         if(h->sps.timing_info_present_flag){
3734             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3735             if(h->x264_build > 0 && h->x264_build < 44)
3736                 s->avctx->time_base.den *= 2;
3737             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3738                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3739         }
3740     }
3741
3742     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3743
3744     h->mb_mbaff = 0;
3745     h->mb_aff_frame = 0;
3746     last_pic_structure = s0->picture_structure;
3747     if(h->sps.frame_mbs_only_flag){
3748         s->picture_structure= PICT_FRAME;
3749     }else{
3750         if(get_bits1(&s->gb)) { //field_pic_flag
3751             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3752         } else {
3753             s->picture_structure= PICT_FRAME;
3754             h->mb_aff_frame = h->sps.mb_aff;
3755         }
3756     }
3757     h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3758
3759     if(h0->current_slice == 0){
3760         while(h->frame_num !=  h->prev_frame_num &&
3761               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3762             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3763             frame_start(h);
3764             h->prev_frame_num++;
3765             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3766             s->current_picture_ptr->frame_num= h->prev_frame_num;
3767             execute_ref_pic_marking(h, NULL, 0);
3768         }
3769
3770         /* See if we have a decoded first field looking for a pair... */
3771         if (s0->first_field) {
3772             assert(s0->current_picture_ptr);
3773             assert(s0->current_picture_ptr->data[0]);
3774             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3775
3776             /* figure out if we have a complementary field pair */
3777             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3778                 /*
3779                  * Previous field is unmatched. Don't display it, but let it
3780                  * remain for reference if marked as such.
3781                  */
3782                 s0->current_picture_ptr = NULL;
3783                 s0->first_field = FIELD_PICTURE;
3784
3785             } else {
3786                 if (h->nal_ref_idc &&
3787                         s0->current_picture_ptr->reference &&
3788                         s0->current_picture_ptr->frame_num != h->frame_num) {
3789                     /*
3790                      * This and previous field were reference, but had
3791                      * different frame_nums. Consider this field first in
3792                      * pair. Throw away previous field except for reference
3793                      * purposes.
3794                      */
3795                     s0->first_field = 1;
3796                     s0->current_picture_ptr = NULL;
3797
3798                 } else {
3799                     /* Second field in complementary pair */
3800                     s0->first_field = 0;
3801                 }
3802             }
3803
3804         } else {
3805             /* Frame or first field in a potentially complementary pair */
3806             assert(!s0->current_picture_ptr);
3807             s0->first_field = FIELD_PICTURE;
3808         }
3809
3810         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3811             s0->first_field = 0;
3812             return -1;
3813         }
3814     }
3815     if(h != h0)
3816         clone_slice(h, h0);
3817
3818     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3819
3820     assert(s->mb_num == s->mb_width * s->mb_height);
3821     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3822        first_mb_in_slice                    >= s->mb_num){
3823         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3824         return -1;
3825     }
3826     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3827     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3828     if (s->picture_structure == PICT_BOTTOM_FIELD)
3829         s->resync_mb_y = s->mb_y = s->mb_y + 1;
3830     assert(s->mb_y < s->mb_height);
3831
3832     if(s->picture_structure==PICT_FRAME){
3833         h->curr_pic_num=   h->frame_num;
3834         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3835     }else{
3836         h->curr_pic_num= 2*h->frame_num + 1;
3837         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3838     }
3839
3840     if(h->nal_unit_type == NAL_IDR_SLICE){
3841         get_ue_golomb(&s->gb); /* idr_pic_id */
3842     }
3843
3844     if(h->sps.poc_type==0){
3845         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3846
3847         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3848             h->delta_poc_bottom= get_se_golomb(&s->gb);
3849         }
3850     }
3851
3852     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3853         h->delta_poc[0]= get_se_golomb(&s->gb);
3854
3855         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3856             h->delta_poc[1]= get_se_golomb(&s->gb);
3857     }
3858
3859     init_poc(h);
3860
3861     if(h->pps.redundant_pic_cnt_present){
3862         h->redundant_pic_count= get_ue_golomb(&s->gb);
3863     }
3864
3865     //set defaults, might be overridden a few lines later
3866     h->ref_count[0]= h->pps.ref_count[0];
3867     h->ref_count[1]= h->pps.ref_count[1];
3868
3869     if(h->slice_type_nos != FF_I_TYPE){
3870         if(h->slice_type_nos == FF_B_TYPE){
3871             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3872         }
3873         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3874
3875         if(num_ref_idx_active_override_flag){
3876             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3877             if(h->slice_type_nos==FF_B_TYPE)
3878                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3879
3880             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3881                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3882                 h->ref_count[0]= h->ref_count[1]= 1;
3883                 return -1;
3884             }
3885         }
3886         if(h->slice_type_nos == FF_B_TYPE)
3887             h->list_count= 2;
3888         else
3889             h->list_count= 1;
3890     }else
3891         h->list_count= 0;
3892
3893     if(!default_ref_list_done){
3894         fill_default_ref_list(h);
3895     }
3896
3897     if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
3898         return -1;
3899
3900     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
3901        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
3902         pred_weight_table(h);
3903     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
3904         implicit_weight_table(h);
3905     else
3906         h->use_weight = 0;
3907
3908     if(h->nal_ref_idc)
3909         decode_ref_pic_marking(h0, &s->gb);
3910
3911     if(FRAME_MBAFF)
3912         fill_mbaff_ref_list(h);
3913
3914     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
3915         tmp = get_ue_golomb(&s->gb);
3916         if(tmp > 2){
3917             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
3918             return -1;
3919         }
3920         h->cabac_init_idc= tmp;
3921     }
3922
3923     h->last_qscale_diff = 0;
3924     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
3925     if(tmp>51){
3926         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
3927         return -1;
3928     }
3929     s->qscale= tmp;
3930     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
3931     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
3932     //FIXME qscale / qp ... stuff
3933     if(h->slice_type == FF_SP_TYPE){
3934         get_bits1(&s->gb); /* sp_for_switch_flag */
3935     }
3936     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
3937         get_se_golomb(&s->gb); /* slice_qs_delta */
3938     }
3939
3940     h->deblocking_filter = 1;
3941     h->slice_alpha_c0_offset = 0;
3942     h->slice_beta_offset = 0;
3943     if( h->pps.deblocking_filter_parameters_present ) {
3944         tmp= get_ue_golomb(&s->gb);
3945         if(tmp > 2){
3946             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
3947             return -1;
3948         }
3949         h->deblocking_filter= tmp;
3950         if(h->deblocking_filter < 2)
3951             h->deblocking_filter^= 1; // 1<->0
3952
3953         if( h->deblocking_filter ) {
3954             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
3955             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
3956         }
3957     }
3958
3959     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
3960        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
3961        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
3962        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
3963         h->deblocking_filter= 0;
3964
3965     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
3966         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
3967             /* Cheat slightly for speed:
3968                Do not bother to deblock across slices. */
3969             h->deblocking_filter = 2;
3970         } else {
3971             h0->max_contexts = 1;
3972             if(!h0->single_decode_warning) {
3973                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
3974                 h0->single_decode_warning = 1;
3975             }
3976             if(h != h0)
3977                 return 1; // deblocking switched inside frame
3978         }
3979     }
3980
3981 #if 0 //FMO
3982     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
3983         slice_group_change_cycle= get_bits(&s->gb, ?);
3984 #endif
3985
3986     h0->last_slice_type = slice_type;
3987     h->slice_num = ++h0->current_slice;
3988
3989     for(j=0; j<2; j++){
3990         int *ref2frm= h->ref2frm[h->slice_num&15][j];
3991         ref2frm[0]=
3992         ref2frm[1]= -1;
3993         for(i=0; i<16; i++)
3994             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
3995                           +(h->ref_list[j][i].reference&3);
3996         ref2frm[18+0]=
3997         ref2frm[18+1]= -1;
3998         for(i=16; i<48; i++)
3999             ref2frm[i+4]= 4*h->ref_list[j][i].frame_num
4000                           +(h->ref_list[j][i].reference&3);
4001     }
4002
4003     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4004     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4005
4006     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4007         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4008                h->slice_num,
4009                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4010                first_mb_in_slice,
4011                av_get_pict_type_char(h->slice_type),
4012                pps_id, h->frame_num,
4013                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4014                h->ref_count[0], h->ref_count[1],
4015                s->qscale,
4016                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4017                h->use_weight,
4018                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4019                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4020                );
4021     }
4022
4023     return 0;
4024 }
4025
4026 /**
4027  *
4028  */
4029 static inline int get_level_prefix(GetBitContext *gb){
4030     unsigned int buf;
4031     int log;
4032
4033     OPEN_READER(re, gb);
4034     UPDATE_CACHE(re, gb);
4035     buf=GET_CACHE(re, gb);
4036
4037     log= 32 - av_log2(buf);
4038 #ifdef TRACE
4039     print_bin(buf>>(32-log), log);
4040     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4041 #endif
4042
4043     LAST_SKIP_BITS(re, gb, log);
4044     CLOSE_READER(re, gb);
4045
4046     return log-1;
4047 }
4048
4049 static inline int get_dct8x8_allowed(H264Context *h){
4050     int i;
4051     for(i=0; i<4; i++){
4052         if(!IS_SUB_8X8(h->sub_mb_type[i])
4053            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4054             return 0;
4055     }
4056     return 1;
4057 }
4058
/**
 * decodes a residual block.
 *
 * Reads, in this order: the coeff_token (total coefficient count and number
 * of trailing ones), the trailing-one signs, the remaining levels, the total
 * number of zeros and the zero runs between coefficients. The coefficients
 * are then written into block[] through scantable, starting at the highest
 * scan position.
 *
 * @param h h264 context
 * @param gb bit reader the block is parsed from
 * @param block destination coefficient array, indexed through scantable
 * @param n block index
 * @param scantable scantable
 * @param qmul dequantization factors indexed like block[]; not used when n > 24
 * @param max_coeff number of coefficients in the block
 * @return <0 if an error occurred
 */
static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
    MpegEncContext * const s = &h->s;
    // maps a predicted coefficient count (0..16) to one of the 4 coeff_token VLC tables
    static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
    int level[16];
    int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;

    //FIXME put trailing_ones into the context

    // coeff_token packs total_coeff in the upper bits and trailing_ones in the low 2 bits
    if(n == CHROMA_DC_BLOCK_INDEX){
        coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
        total_coeff= coeff_token>>2;
    }else{
        if(n == LUMA_DC_BLOCK_INDEX){
            // the table is selected by the count predicted for block 0; the cache is not updated
            total_coeff= pred_non_zero_count(h, 0);
            coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
            total_coeff= coeff_token>>2;
        }else{
            // the table is selected by the predicted count; the decoded count is stored in the cache
            total_coeff= pred_non_zero_count(h, n);
            coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
            total_coeff= coeff_token>>2;
            h->non_zero_count_cache[ scan8[n] ]= total_coeff;
        }
    }

    //FIXME set last_non_zero?

    if(total_coeff==0)
        return 0;
    // the unsigned comparison also rejects a negative total_coeff
    if(total_coeff > (unsigned)max_coeff) {
        av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
        return -1;
    }

    trailing_ones= coeff_token&3;
    tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
    assert(total_coeff<=16);

    // trailing ones: one sign bit each, giving +1 (bit 0) or -1 (bit 1)
    for(i=0; i<trailing_ones; i++){
        level[i]= 1 - 2*get_bits1(gb);
    }

    if(i<total_coeff) {
        int level_code, mask;
        int suffix_length = total_coeff > 10 && trailing_ones < 3;
        int prefix= get_level_prefix(gb);

        //first coefficient has suffix_length equal to 0 or 1
        if(prefix<14){ //FIXME try to build a large unified VLC table for all this
            if(suffix_length)
                level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
            else
                level_code= (prefix<<suffix_length); //part
        }else if(prefix==14){
            if(suffix_length)
                level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
            else
                level_code= prefix + get_bits(gb, 4); //part
        }else{
            // prefix >= 15: the suffix is prefix-3 bits long
            level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
            if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
            if(prefix>=16)
                level_code += (1<<(prefix-3))-4096;
        }

        if(trailing_ones < 3) level_code += 2;

        suffix_length = 1;
        if(level_code > 5)
            suffix_length++;
        // map level_code to a signed level: even codes become positive, odd codes negative
        mask= -(level_code&1);
        level[i]= (((2+level_code)>>1) ^ mask) - mask;
        i++;

        //remaining coefficients have suffix_length > 0
        for(;i<total_coeff;i++) {
            // suffix_length is incremented once level_code exceeds the limit for the current length
            static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
            prefix = get_level_prefix(gb);
            if(prefix<15){
                level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
            }else{
                level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
                if(prefix>=16)
                    level_code += (1<<(prefix-3))-4096;
            }
            mask= -(level_code&1);
            level[i]= (((2+level_code)>>1) ^ mask) - mask;
            if(level_code > suffix_limit[suffix_length])
                suffix_length++;
        }
    }

    // total number of zeros; nothing is read when the block is completely filled
    if(total_coeff == max_coeff)
        zeros_left=0;
    else{
        if(n == CHROMA_DC_BLOCK_INDEX)
            zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
        else
            zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
    }

    // scan position of the first stored level; later levels go to lower positions
    coeff_num = zeros_left + total_coeff - 1;
    j = scantable[coeff_num];
    if(n > 24){
        // block indices above 24: levels are stored as read, qmul is not applied
        block[j] = level[0];
        for(i=1;i<total_coeff;i++) {
            if(zeros_left <= 0)
                run_before = 0;
            else if(zeros_left < 7){
                run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
            }else{
                run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
            }
            zeros_left -= run_before;
            coeff_num -= 1 + run_before;
            j= scantable[ coeff_num ];

            block[j]= level[i];
        }
    }else{
        // other blocks: each level is scaled by qmul[j] with rounding (+32, >>6)
        block[j] = (level[0] * qmul[j] + 32)>>6;
        for(i=1;i<total_coeff;i++) {
            if(zeros_left <= 0)
                run_before = 0;
            else if(zeros_left < 7){
                run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
            }else{
                run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
            }
            zeros_left -= run_before;
            coeff_num -= 1 + run_before;
            j= scantable[ coeff_num ];

            block[j]= (level[i] * qmul[j] + 32)>>6;
        }
    }

    // NOTE(review): this check runs only after the coefficients were stored, so
    // with a run larger than zeros_left coeff_num may already have gone
    // negative when indexing scantable above -- confirm against the VLC tables
    if(zeros_left<0){
        av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
        return -1;
    }

    return 0;
}
4209
4210 static void predict_field_decoding_flag(H264Context *h){
4211     MpegEncContext * const s = &h->s;
4212     const int mb_xy= h->mb_xy;
4213     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4214                 ? s->current_picture.mb_type[mb_xy-1]
4215                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4216                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4217                 : 0;
4218     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4219 }
4220
4221 /**
4222  * decodes a P_SKIP or B_SKIP macroblock
4223  */
4224 static void decode_mb_skip(H264Context *h){
4225     MpegEncContext * const s = &h->s;
4226     const int mb_xy= h->mb_xy;
4227     int mb_type=0;
4228
4229     memset(h->non_zero_count[mb_xy], 0, 16);
4230     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4231
4232     if(MB_FIELD)
4233         mb_type|= MB_TYPE_INTERLACED;
4234
4235     if( h->slice_type_nos == FF_B_TYPE )
4236     {
4237         // just for fill_caches. pred_direct_motion will set the real mb_type
4238         mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4239
4240         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4241         pred_direct_motion(h, &mb_type);
4242         mb_type|= MB_TYPE_SKIP;
4243     }
4244     else
4245     {
4246         int mx, my;
4247         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4248
4249         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4250         pred_pskip_motion(h, &mx, &my);
4251         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4252         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4253     }
4254
4255     write_back_motion(h, mb_type);
4256     s->current_picture.mb_type[mb_xy]= mb_type;
4257     s->current_picture.qscale_table[mb_xy]= s->qscale;
4258     h->slice_table[ mb_xy ]= h->slice_num;
4259     h->prev_mb_skipped= 1;
4260 }
4261
4262 /**
4263  * Decodes one macroblock of a CAVLC coded slice.
      * Syntax elements are read from s->gb; residual coefficients are read via
      * h->intra_gb_ptr or h->inter_gb_ptr. Prediction modes, references and
      * motion vectors go to the per-macroblock caches in h; mb_type, qscale and
      * the slice number are stored in the per-picture tables at index mb_xy.
4264  * @returns 0 if OK, -1 if an out-of-range syntax element is found or a
      *          helper (prediction mode check, decode_residual) reports failure
4265  */
4266 static int decode_mb_cavlc(H264Context *h){
4267     MpegEncContext * const s = &h->s;
4268     int mb_xy;
4269     int partition_count;
4270     unsigned int mb_type, cbp;
4271     int dct8x8_allowed= h->pps.transform_8x8_mode;
4272
4273     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4274
4275     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handling?)
4276
4277     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4278     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4279                 down the code */
         /* Skip handling, non-I slices only: an mb_skip_run of -1 triggers reading
          * a new run length; while the run is non-zero the macroblock is skipped. */
4280     if(h->slice_type_nos != FF_I_TYPE){
4281         if(s->mb_skip_run==-1)
4282             s->mb_skip_run= get_ue_golomb(&s->gb);
4283
4284         if (s->mb_skip_run--) {
                 /* For even rows in MBAFF: the field flag is read from the
                  * bitstream when this was the last skipped macroblock of the
                  * run, otherwise it is predicted. */
4285             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4286                 if(s->mb_skip_run==0)
4287                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4288                 else
4289                     predict_field_decoding_flag(h);
4290             }
4291             decode_mb_skip(h);
4292             return 0;
4293         }
4294     }
4295     if(FRAME_MBAFF){
4296         if( (s->mb_y&1) == 0 )
4297             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4298     }
4299
4300     h->prev_mb_skipped= 0;
4301
         /* Map the coded mb_type: in B and P slices values below 23 resp. 5 index
          * the inter type tables, larger values are intra types after subtracting
          * that offset. */
4302     mb_type= get_ue_golomb(&s->gb);
4303     if(h->slice_type_nos == FF_B_TYPE){
4304         if(mb_type < 23){
4305             partition_count= b_mb_type_info[mb_type].partition_count;
4306             mb_type=         b_mb_type_info[mb_type].type;
4307         }else{
4308             mb_type -= 23;
4309             goto decode_intra_mb;
4310         }
4311     }else if(h->slice_type_nos == FF_P_TYPE){
4312         if(mb_type < 5){
4313             partition_count= p_mb_type_info[mb_type].partition_count;
4314             mb_type=         p_mb_type_info[mb_type].type;
4315         }else{
4316             mb_type -= 5;
4317             goto decode_intra_mb;
4318         }
4319     }else{
4320        assert(h->slice_type_nos == FF_I_TYPE);
             /* In SI slices a non-zero mb_type is shifted down by one. */
4321         if(h->slice_type == FF_SI_TYPE && mb_type)
4322             mb_type--;
4323 decode_intra_mb:
4324         if(mb_type > 25){
4325             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4326             return -1;
4327         }
4328         partition_count=0;
4329         cbp= i_mb_type_info[mb_type].cbp;
4330         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4331         mb_type= i_mb_type_info[mb_type].type;
4332     }
4333
4334     if(MB_FIELD)
4335         mb_type |= MB_TYPE_INTERLACED;
4336
4337     h->slice_table[ mb_xy ]= h->slice_num;
4338
4339     if(IS_INTRA_PCM(mb_type)){
4340         unsigned int x;
4341
4342         // We assume these blocks are very rare so we do not optimize it.
4343         align_get_bits(&s->gb);
4344
4345         // The pixels are stored in the same order as levels in h->mb array.
             // 384 bytes are read when CHROMA is set, 256 otherwise.
4346         for(x=0; x < (CHROMA ? 384 : 256); x++){
4347             ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4348         }
4349
4350         // In deblocking, the quantizer is 0
4351         s->current_picture.qscale_table[mb_xy]= 0;
4352         // All coeffs are present
4353         memset(h->non_zero_count[mb_xy], 16, 16);
4354
4355         s->current_picture.mb_type[mb_xy]= mb_type;
4356         return 0;
4357     }
4358
         /* The reference counts are doubled while decoding an MB_MBAFF macroblock
          * and halved again just before the final return.
          * NOTE(review): the early "return -1" paths below leave them doubled --
          * confirm that callers abandon or reset the slice state on error. */
4359     if(MB_MBAFF){
4360         h->ref_count[0] <<= 1;
4361         h->ref_count[1] <<= 1;
4362     }
4363
4364     fill_caches(h, mb_type, 0);
4365
4366     //mb_pred
4367     if(IS_INTRA(mb_type)){
4368         int pred_mode;
4369 //            init_top_left_availability(h);
4370         if(IS_INTRA4x4(mb_type)){
4371             int i;
4372             int di = 1;
                 /* With the 8x8 transform flag set, one mode is read per 8x8 block
                  * (step 4) and replicated over its 2x2 cache entries. */
4373             if(dct8x8_allowed && get_bits1(&s->gb)){
4374                 mb_type |= MB_TYPE_8x8DCT;
4375                 di = 4;
4376             }
4377
4378 //                fill_intra4x4_pred_table(h);
4379             for(i=0; i<16; i+=di){
4380                 int mode= pred_intra_mode(h, i);
4381
                     /* A 0 bit means the predicted mode is not used: a 3 bit
                      * value follows which is incremented when it is not below
                      * the predicted mode. */
4382                 if(!get_bits1(&s->gb)){
4383                     const int rem_mode= get_bits(&s->gb, 3);
4384                     mode = rem_mode + (rem_mode >= mode);
4385                 }
4386
4387                 if(di==4)
4388                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4389                 else
4390                     h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4391             }
4392             write_back_intra_pred_mode(h);
4393             if( check_intra4x4_pred_mode(h) < 0)
4394                 return -1;
4395         }else{
4396             h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4397             if(h->intra16x16_pred_mode < 0)
4398                 return -1;
4399         }
4400         if(CHROMA){
4401             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4402             if(pred_mode < 0)
4403                 return -1;
4404             h->chroma_pred_mode= pred_mode;
4405         }
4406     }else if(partition_count==4){
             /* Four sub-macroblocks: read all sub types first, then all
              * reference indices, then the motion vector differences. */
4407         int i, j, sub_partition_count[4], list, ref[2][4];
4408
4409         if(h->slice_type_nos == FF_B_TYPE){
4410             for(i=0; i<4; i++){
4411                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4412                 if(h->sub_mb_type[i] >=13){
4413                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4414                     return -1;
4415                 }
4416                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4417                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4418             }
4419             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4420                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4421                 pred_direct_motion(h, &mb_type);
4422                 h->ref_cache[0][scan8[4]] =
4423                 h->ref_cache[1][scan8[4]] =
4424                 h->ref_cache[0][scan8[12]] =
4425                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4426             }
4427         }else{
4428             assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4429             for(i=0; i<4; i++){
4430                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4431                 if(h->sub_mb_type[i] >=4){
4432                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4433                     return -1;
4434                 }
4435                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4436                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4437             }
4438         }
4439
4440         for(list=0; list<h->list_count; list++){
4441             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4442             for(i=0; i<4; i++){
4443                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4444                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4445                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4446                     if(tmp>=ref_count){
4447                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4448                         return -1;
4449                     }
4450                     ref[list][i]= tmp;
4451                 }else{
4452                  //FIXME
4453                     ref[list][i] = -1;
4454                 }
4455             }
4456         }
4457
4458         if(dct8x8_allowed)
4459             dct8x8_allowed = get_dct8x8_allowed(h);
4460
4461         for(list=0; list<h->list_count; list++){
4462             for(i=0; i<4; i++){
4463                 if(IS_DIRECT(h->sub_mb_type[i])) {
4464                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4465                     continue;
4466                 }
4467                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4468                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4469
4470                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4471                     const int sub_mb_type= h->sub_mb_type[i];
4472                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4473                     for(j=0; j<sub_partition_count[i]; j++){
4474                         int mx, my;
4475                         const int index= 4*i + block_width*j;
4476                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4477                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4478                         mx += get_se_golomb(&s->gb);
4479                         my += get_se_golomb(&s->gb);
4480                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4481
                             /* Replicate the vector over every cache entry covered
                              * by this sub-partition. */
4482                         if(IS_SUB_8X8(sub_mb_type)){
4483                             mv_cache[ 1 ][0]=
4484                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4485                             mv_cache[ 1 ][1]=
4486                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4487                         }else if(IS_SUB_8X4(sub_mb_type)){
4488                             mv_cache[ 1 ][0]= mx;
4489                             mv_cache[ 1 ][1]= my;
4490                         }else if(IS_SUB_4X8(sub_mb_type)){
4491                             mv_cache[ 8 ][0]= mx;
4492                             mv_cache[ 8 ][1]= my;
4493                         }
4494                         mv_cache[ 0 ][0]= mx;
4495                         mv_cache[ 0 ][1]= my;
4496                     }
4497                 }else{
                         /* List not used by this sub-macroblock: zero its four
                          * cached vectors, written as 32 bit words. */
4498                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4499                     p[0] = p[1]=
4500                     p[8] = p[9]= 0;
4501                 }
4502             }
4503         }
4504     }else if(IS_DIRECT(mb_type)){
4505         pred_direct_motion(h, &mb_type);
4506         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4507     }else{
             /* 16x16, 16x8 or 8x16 partitions: for each shape all reference
              * indices are read before any motion vector difference. */
4508         int list, mx, my, i;
4509          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4510         if(IS_16X16(mb_type)){
4511             for(list=0; list<h->list_count; list++){
4512                     unsigned int val;
4513                     if(IS_DIR(mb_type, 0, list)){
4514                         val= get_te0_golomb(&s->gb, h->ref_count[list]);
4515                         if(val >= h->ref_count[list]){
4516                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4517                             return -1;
4518                         }
4519                     }else
4520                         val= LIST_NOT_USED&0xFF;
4521                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4522             }
4523             for(list=0; list<h->list_count; list++){
4524                 unsigned int val;
4525                 if(IS_DIR(mb_type, 0, list)){
4526                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4527                     mx += get_se_golomb(&s->gb);
4528                     my += get_se_golomb(&s->gb);
4529                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4530
4531                     val= pack16to32(mx,my);
4532                 }else
4533                     val=0;
4534                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4535             }
4536         }
4537         else if(IS_16X8(mb_type)){
4538             for(list=0; list<h->list_count; list++){
4539                     for(i=0; i<2; i++){
4540                         unsigned int val;
4541                         if(IS_DIR(mb_type, i, list)){
4542                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4543                             if(val >= h->ref_count[list]){
4544                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4545                                 return -1;
4546                             }
4547                         }else
4548                             val= LIST_NOT_USED&0xFF;
4549                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4550                     }
4551             }
4552             for(list=0; list<h->list_count; list++){
4553                 for(i=0; i<2; i++){
4554                     unsigned int val;
4555                     if(IS_DIR(mb_type, i, list)){
4556                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4557                         mx += get_se_golomb(&s->gb);
4558                         my += get_se_golomb(&s->gb);
4559                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4560
4561                         val= pack16to32(mx,my);
4562                     }else
4563                         val=0;
4564                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4565                 }
4566             }
4567         }else{
4568             assert(IS_8X16(mb_type));
4569             for(list=0; list<h->list_count; list++){
4570                     for(i=0; i<2; i++){
4571                         unsigned int val;
4572                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4573                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4574                             if(val >= h->ref_count[list]){
4575                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4576                                 return -1;
4577                             }
4578                         }else
4579                             val= LIST_NOT_USED&0xFF;
4580                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4581                     }
4582             }
4583             for(list=0; list<h->list_count; list++){
4584                 for(i=0; i<2; i++){
4585                     unsigned int val;
4586                     if(IS_DIR(mb_type, i, list)){
4587                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4588                         mx += get_se_golomb(&s->gb);
4589                         my += get_se_golomb(&s->gb);
4590                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4591
4592                         val= pack16to32(mx,my);
4593                     }else
4594                         val=0;
4595                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4596                 }
4597             }
4598         }
4599     }
4600
4601     if(IS_INTER(mb_type))
4602         write_back_motion(h, mb_type);
4603
         /* Intra16x16 took its cbp from i_mb_type_info above; every other type
          * codes it explicitly as an index (at most 47) into a mapping table. */
4604     if(!IS_INTRA16x16(mb_type)){
4605         cbp= get_ue_golomb(&s->gb);
4606         if(cbp > 47){
4607             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4608             return -1;
4609         }
4610
4611         if(CHROMA){
4612             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4613             else                     cbp= golomb_to_inter_cbp   [cbp];
4614         }else{
4615             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4616             else                     cbp= golomb_to_inter_cbp_gray[cbp];
4617         }
4618     }
4619     h->cbp = cbp;
4620
         /* For non-intra macroblocks with coded luma (low 4 cbp bits) the 8x8
          * transform flag is read here. */
4621     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4622         if(get_bits1(&s->gb)){
4623             mb_type |= MB_TYPE_8x8DCT;
4624             h->cbp_table[mb_xy]= cbp;
4625         }
4626     }
4627     s->current_picture.mb_type[mb_xy]= mb_type;
4628
4629     if(cbp || IS_INTRA16x16(mb_type)){
4630         int i8x8, i4x4, chroma_idx;
4631         int dquant;
             /* Residuals of intra and inter macroblocks are read through
              * separate bit reader pointers. */
4632         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4633         const uint8_t *scan, *scan8x8, *dc_scan;
4634
4635 //        fill_non_zero_count_cache(h);
4636
             /* Scan tables depend on the interlaced flag; the *_q0 variants are
              * chosen while s->qscale (before dquant is applied) is 0. */
4637         if(IS_INTERLACED(mb_type)){
4638             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4639             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4640             dc_scan= luma_dc_field_scan;
4641         }else{
4642             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4643             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4644             dc_scan= luma_dc_zigzag_scan;
4645         }
4646
4647         dquant= get_se_golomb(&s->gb);
4648
4649         if( dquant > 25 || dquant < -26 ){
4650             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4651             return -1;
4652         }
4653
             /* Apply dquant and wrap the result back into 0..51 (modulo 52). */
4654         s->qscale += dquant;
4655         if(((unsigned)s->qscale) > 51){
4656             if(s->qscale<0) s->qscale+= 52;
4657             else            s->qscale-= 52;
4658         }
4659
4660         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4661         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4662         if(IS_INTRA16x16(mb_type)){
                 /* Intra16x16: one luma DC block, then (if luma is coded) sixteen
                  * 15 coefficient AC blocks scanned from position 1. */
4663             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4664                 return -1; //FIXME continue if partitioned and other return -1 too
4665             }
4666
4667             assert((cbp&15) == 0 || (cbp&15) == 15);
4668
4669             if(cbp&15){
4670                 for(i8x8=0; i8x8<4; i8x8++){
4671                     for(i4x4=0; i4x4<4; i4x4++){
4672                         const int index= i4x4 + 4*i8x8;
4673                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4674                             return -1;
4675                         }
4676                     }
4677                 }
4678             }else{
4679                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4680             }
4681         }else{
                 /* One cbp bit per 8x8 luma block; uncoded blocks get their
                  * non-zero counts cleared. */
4682             for(i8x8=0; i8x8<4; i8x8++){
4683                 if(cbp & (1<<i8x8)){
4684                     if(IS_8x8DCT(mb_type)){
4685                         DCTELEM *buf = &h->mb[64*i8x8];
4686                         uint8_t *nnz;
4687                         for(i4x4=0; i4x4<4; i4x4++){
4688                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4689                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4690                                 return -1;
4691                         }
                             /* Accumulate the four 4x4 counts into the first
                              * cache entry of this 8x8 block. */
4692                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4693                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4694                     }else{
4695                         for(i4x4=0; i4x4<4; i4x4++){
4696                             const int index= i4x4 + 4*i8x8;
4697
4698                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4699                                 return -1;
4700                             }
4701                         }
4702                     }
4703                 }else{
4704                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4705                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4706                 }
4707             }
4708         }
4709
             /* cbp bits 0x30: chroma DC blocks are present; bit 0x20: chroma AC
              * blocks are present as well. */
4710         if(cbp&0x30){
4711             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4712                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4713                     return -1;
4714                 }
4715         }
4716
4717         if(cbp&0x20){
4718             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4719                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4720                 for(i4x4=0; i4x4<4; i4x4++){
4721                     const int index= 16 + 4*chroma_idx + i4x4;
4722                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4723                         return -1;
4724                     }
4725                 }
4726             }
4727         }else{
4728             uint8_t * const nnz= &h->non_zero_count_cache[0];
4729             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4730             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4731         }
4732     }else{
             /* Nothing coded: clear all luma and chroma non-zero counts. */
4733         uint8_t * const nnz= &h->non_zero_count_cache[0];
4734         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4735         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4736         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4737     }
4738     s->current_picture.qscale_table[mb_xy]= s->qscale;
4739     write_back_non_zero_count(h);
4740
         /* Undo the doubling of the reference counts done before fill_caches(). */
4741     if(MB_MBAFF){
4742         h->ref_count[0] >>= 1;
4743         h->ref_count[1] >>= 1;
4744     }
4745
4746     return 0;
4747 }
4748
4749 static int decode_cabac_field_decoding_flag(H264Context *h) {
4750     MpegEncContext * const s = &h->s;
4751     const int mb_x = s->mb_x;
4752     const int mb_y = s->mb_y & ~1;
4753     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4754     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4755
4756     unsigned int ctx = 0;
4757
4758     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4759         ctx += 1;
4760     }
4761     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4762         ctx += 1;
4763     }
4764
4765     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4766 }
4767
4768 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4769     uint8_t *state= &h->cabac_state[ctx_base];
4770     int mb_type;
4771
4772     if(intra_slice){
4773         MpegEncContext * const s = &h->s;
4774         const int mba_xy = h->left_mb_xy[0];
4775         const int mbb_xy = h->top_mb_xy;
4776         int ctx=0;
4777         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4778             ctx++;
4779         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4780             ctx++;
4781         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4782             return 0;   /* I4x4 */
4783         state += 2;
4784     }else{
4785         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4786             return 0;   /* I4x4 */
4787     }
4788
4789     if( get_cabac_terminate( &h->cabac ) )
4790         return 25;  /* PCM */
4791
4792     mb_type = 1; /* I16x16 */
4793     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4794     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4795         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4796     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4797     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4798     return mb_type;
4799 }
4800
4801 static int decode_cabac_mb_type( H264Context *h ) {
4802     MpegEncContext * const s = &h->s;
4803
4804     if( h->slice_type_nos == FF_I_TYPE ) {
4805         return decode_cabac_intra_mb_type(h, 3, 1);
4806     } else if( h->slice_type_nos == FF_P_TYPE ) {
4807         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
4808             /* P-type */
4809             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
4810                 /* P_L0_D16x16, P_8x8 */
4811                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
4812             } else {
4813                 /* P_L0_D8x16, P_L0_D16x8 */
4814                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
4815             }
4816         } else {
4817             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
4818         }
4819     } else if( h->slice_type_nos == FF_B_TYPE ) {
4820         const int mba_xy = h->left_mb_xy[0];
4821         const int mbb_xy = h->top_mb_xy;
4822         int ctx = 0;
4823         int bits;
4824
4825         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4826             ctx++;
4827         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4828             ctx++;
4829
4830         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4831             return 0; /* B_Direct_16x16 */
4832
4833         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4834             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4835         }
4836
4837         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4838         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4839         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4840         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4841         if( bits < 8 )
4842             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4843         else if( bits == 13 ) {
4844             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4845         } else if( bits == 14 )
4846             return 11; /* B_L1_L0_8x16 */
4847         else if( bits == 15 )
4848             return 22; /* B_8x8 */
4849
4850         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4851         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4852     } else {
4853         /* TODO SI/SP frames? */
4854         return -1;
4855     }
4856 }
4857
4858 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
4859     MpegEncContext * const s = &h->s;
4860     int mba_xy, mbb_xy;
4861     int ctx = 0;
4862
4863     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
4864         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
4865         mba_xy = mb_xy - 1;
4866         if( (mb_y&1)
4867             && h->slice_table[mba_xy] == h->slice_num
4868             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
4869             mba_xy += s->mb_stride;
4870         if( MB_FIELD ){
4871             mbb_xy = mb_xy - s->mb_stride;
4872             if( !(mb_y&1)
4873                 && h->slice_table[mbb_xy] == h->slice_num
4874                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
4875                 mbb_xy -= s->mb_stride;
4876         }else
4877             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
4878     }else{
4879         int mb_xy = h->mb_xy;
4880         mba_xy = mb_xy - 1;
4881         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
4882     }
4883
4884     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4885         ctx++;
4886     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
4887         ctx++;
4888
4889     if( h->slice_type_nos == FF_B_TYPE )
4890         ctx += 13;
4891     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
4892 }
4893
4894 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4895     int mode = 0;
4896
4897     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
4898         return pred_mode;
4899
4900     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
4901     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
4902     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
4903
4904     if( mode >= pred_mode )
4905         return mode + 1;
4906     else
4907         return mode;
4908 }
4909
4910 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
4911     const int mba_xy = h->left_mb_xy[0];
4912     const int mbb_xy = h->top_mb_xy;
4913
4914     int ctx = 0;
4915
4916     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
4917     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
4918         ctx++;
4919
4920     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
4921         ctx++;
4922
4923     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
4924         return 0;
4925
4926     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4927         return 1;
4928     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4929         return 2;
4930     else
4931         return 3;
4932 }
4933
4934 static int decode_cabac_mb_cbp_luma( H264Context *h) {
4935     int cbp_b, cbp_a, ctx, cbp = 0;
4936
4937     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
4938     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
4939
4940     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
4941     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
4942     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
4943     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
4944     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
4945     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
4946     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
4947     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
4948     return cbp;
4949 }
4950 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
4951     int ctx;
4952     int cbp_a, cbp_b;
4953
4954     cbp_a = (h->left_cbp>>4)&0x03;
4955     cbp_b = (h-> top_cbp>>4)&0x03;
4956
4957     ctx = 0;
4958     if( cbp_a > 0 ) ctx++;
4959     if( cbp_b > 0 ) ctx += 2;
4960     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
4961         return 0;
4962
4963     ctx = 4;
4964     if( cbp_a == 2 ) ctx++;
4965     if( cbp_b == 2 ) ctx += 2;
4966     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
4967 }
4968 static int decode_cabac_mb_dqp( H264Context *h) {
4969     int   ctx = 0;
4970     int   val = 0;
4971
4972     if( h->last_qscale_diff != 0 )
4973         ctx++;
4974
4975     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
4976         if( ctx < 2 )
4977             ctx = 2;
4978         else
4979             ctx = 3;
4980         val++;
4981         if(val > 102) //prevent infinite loop
4982             return INT_MIN;
4983     }
4984
4985     if( val&0x01 )
4986         return (val + 1)/2;
4987     else
4988         return -(val + 1)/2;
4989 }
4990 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
4991     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
4992         return 0;   /* 8x8 */
4993     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
4994         return 1;   /* 8x4 */
4995     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
4996         return 2;   /* 4x8 */
4997     return 3;       /* 4x4 */
4998 }
4999 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5000     int type;
5001     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5002         return 0;   /* B_Direct_8x8 */
5003     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5004         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5005     type = 3;
5006     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5007         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5008             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5009         type += 4;
5010     }
5011     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5012     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5013     return type;
5014 }
5015
5016 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5017     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5018 }
5019
5020 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5021     int refa = h->ref_cache[list][scan8[n] - 1];
5022     int refb = h->ref_cache[list][scan8[n] - 8];
5023     int ref  = 0;
5024     int ctx  = 0;
5025
5026     if( h->slice_type_nos == FF_B_TYPE) {
5027         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5028             ctx++;
5029         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5030             ctx += 2;
5031     } else {
5032         if( refa > 0 )
5033             ctx++;
5034         if( refb > 0 )
5035             ctx += 2;
5036     }
5037
5038     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5039         ref++;
5040         if( ctx < 4 )
5041             ctx = 4;
5042         else
5043             ctx = 5;
5044         if(ref >= 32 /*h->ref_list[list]*/){
5045             av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
5046             return 0; //FIXME we should return -1 and check the return everywhere
5047         }
5048     }
5049     return ref;
5050 }
5051
5052 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5053     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5054                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5055     int ctxbase = (l == 0) ? 40 : 47;
5056     int ctx, mvd;
5057
5058     if( amvd < 3 )
5059         ctx = 0;
5060     else if( amvd > 32 )
5061         ctx = 2;
5062     else
5063         ctx = 1;
5064
5065     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5066         return 0;
5067
5068     mvd= 1;
5069     ctx= 3;
5070     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5071         mvd++;
5072         if( ctx < 6 )
5073             ctx++;
5074     }
5075
5076     if( mvd >= 9 ) {
5077         int k = 3;
5078         while( get_cabac_bypass( &h->cabac ) ) {
5079             mvd += 1 << k;
5080             k++;
5081             if(k>24){
5082                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5083                 return INT_MIN;
5084             }
5085         }
5086         while( k-- ) {
5087             if( get_cabac_bypass( &h->cabac ) )
5088                 mvd += 1 << k;
5089         }
5090     }
5091     return get_cabac_bypass_sign( &h->cabac, -mvd );
5092 }
5093
5094 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5095     int nza, nzb;
5096     int ctx = 0;
5097
5098     if( is_dc ) {
5099         if( cat == 0 ) {
5100             nza = h->left_cbp&0x100;
5101             nzb = h-> top_cbp&0x100;
5102         } else {
5103             nza = (h->left_cbp>>(6+idx))&0x01;
5104             nzb = (h-> top_cbp>>(6+idx))&0x01;
5105         }
5106     } else {
5107         if( cat == 4 ) {
5108             nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5109             nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5110         } else {
5111             assert(cat == 1 || cat == 2);
5112             nza = h->non_zero_count_cache[scan8[idx] - 1];
5113             nzb = h->non_zero_count_cache[scan8[idx] - 8];
5114         }
5115     }
5116
5117     if( nza > 0 )
5118         ctx++;
5119
5120     if( nzb > 0 )
5121         ctx += 2;
5122
5123     return ctx + 4 * cat;
5124 }
5125
/**
 * Context offset of the "last significant coefficient" flag for each of the
 * first 63 scan positions of an 8x8 block. The entry is added to the
 * last-coefficient context base when decoding the significance map.
 * NOTE(review): declared with DECLARE_ASM_CONST, presumably so the x86
 * assembly significance decoder can reference it as well -- confirm.
 */
DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
    5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
};
5132
5133 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5134     static const int significant_coeff_flag_offset[2][6] = {
5135       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5136       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5137     };
5138     static const int last_coeff_flag_offset[2][6] = {
5139       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5140       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5141     };
5142     static const int coeff_abs_level_m1_offset[6] = {
5143         227+0, 227+10, 227+20, 227+30, 227+39, 426
5144     };
5145     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5146       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5147         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5148         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5149        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5150       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5151         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5152         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5153         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5154     };
5155     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5156      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5157      * map node ctx => cabac ctx for level=1 */
5158     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5159     /* map node ctx => cabac ctx for level>1 */
5160     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5161     static const uint8_t coeff_abs_level_transition[2][8] = {
5162     /* update node ctx after decoding a level=1 */
5163         { 1, 2, 3, 3, 4, 5, 6, 7 },
5164     /* update node ctx after decoding a level>1 */
5165         { 4, 4, 4, 4, 5, 6, 7, 7 }
5166     };
5167
5168     int index[64];
5169
5170     int av_unused last;
5171     int coeff_count = 0;
5172     int node_ctx = 0;
5173
5174     uint8_t *significant_coeff_ctx_base;
5175     uint8_t *last_coeff_ctx_base;
5176     uint8_t *abs_level_m1_ctx_base;
5177
5178 #ifndef ARCH_X86
5179 #define CABAC_ON_STACK
5180 #endif
5181 #ifdef CABAC_ON_STACK
5182 #define CC &cc
5183     CABACContext cc;
5184     cc.range     = h->cabac.range;
5185     cc.low       = h->cabac.low;
5186     cc.bytestream= h->cabac.bytestream;
5187 #else
5188 #define CC &h->cabac
5189 #endif
5190
5191
5192     /* cat: 0-> DC 16x16  n = 0
5193      *      1-> AC 16x16  n = luma4x4idx
5194      *      2-> Luma4x4   n = luma4x4idx
5195      *      3-> DC Chroma n = iCbCr
5196      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5197      *      5-> Luma8x8   n = 4 * luma8x8idx
5198      */
5199
5200     /* read coded block flag */
5201     if( is_dc || cat != 5 ) {
5202         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5203             if( !is_dc ) {
5204                 if( cat == 4 )
5205                     h->non_zero_count_cache[scan8[16+n]] = 0;
5206                 else
5207                     h->non_zero_count_cache[scan8[n]] = 0;
5208             }
5209
5210 #ifdef CABAC_ON_STACK
5211             h->cabac.range     = cc.range     ;
5212             h->cabac.low       = cc.low       ;
5213             h->cabac.bytestream= cc.bytestream;
5214 #endif
5215             return;
5216         }
5217     }
5218
5219     significant_coeff_ctx_base = h->cabac_state
5220         + significant_coeff_flag_offset[MB_FIELD][cat];
5221     last_coeff_ctx_base = h->cabac_state
5222         + last_coeff_flag_offset[MB_FIELD][cat];
5223     abs_level_m1_ctx_base = h->cabac_state
5224         + coeff_abs_level_m1_offset[cat];
5225
5226     if( !is_dc && cat == 5 ) {
5227 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5228         for(last= 0; last < coefs; last++) { \
5229             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5230             if( get_cabac( CC, sig_ctx )) { \
5231                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5232                 index[coeff_count++] = last; \
5233                 if( get_cabac( CC, last_ctx ) ) { \
5234                     last= max_coeff; \
5235                     break; \
5236                 } \
5237             } \
5238         }\
5239         if( last == max_coeff -1 ) {\
5240             index[coeff_count++] = last;\
5241         }
5242         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5243 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5244         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5245     } else {
5246         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5247 #else
5248         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5249     } else {
5250         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5251 #endif
5252     }
5253     assert(coeff_count > 0);
5254
5255     if( is_dc ) {
5256         if( cat == 0 )
5257             h->cbp_table[h->mb_xy] |= 0x100;
5258         else
5259             h->cbp_table[h->mb_xy] |= 0x40 << n;
5260     } else {
5261         if( cat == 5 )
5262             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5263         else if( cat == 4 )
5264             h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5265         else {
5266             assert( cat == 1 || cat == 2 );
5267             h->non_zero_count_cache[scan8[n]] = coeff_count;
5268         }
5269     }
5270
5271     do {
5272         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5273
5274         int j= scantable[index[--coeff_count]];
5275
5276         if( get_cabac( CC, ctx ) == 0 ) {
5277             node_ctx = coeff_abs_level_transition[0][node_ctx];
5278             if( is_dc ) {
5279                 block[j] = get_cabac_bypass_sign( CC, -1);
5280             }else{
5281                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5282             }
5283         } else {
5284             int coeff_abs = 2;
5285             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5286             node_ctx = coeff_abs_level_transition[1][node_ctx];
5287
5288             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5289                 coeff_abs++;
5290             }
5291
5292             if( coeff_abs >= 15 ) {
5293                 int j = 0;
5294                 while( get_cabac_bypass( CC ) ) {
5295                     j++;
5296                 }
5297
5298                 coeff_abs=1;
5299                 while( j-- ) {
5300                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5301                 }
5302                 coeff_abs+= 14;
5303             }
5304
5305             if( is_dc ) {
5306                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5307             }else{
5308                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5309             }
5310         }
5311     } while( coeff_count );
5312 #ifdef CABAC_ON_STACK
5313             h->cabac.range     = cc.range     ;
5314             h->cabac.low       = cc.low       ;
5315             h->cabac.bytestream= cc.bytestream;
5316 #endif
5317
5318 }
5319
5320 #ifndef CONFIG_SMALL
5321 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5322     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5323 }
5324
5325 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5326     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5327 }
5328 #endif
5329
5330 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5331 #ifdef CONFIG_SMALL
5332     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5333 #else
5334     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5335     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5336 #endif
5337 }
5338
5339 static inline void compute_mb_neighbors(H264Context *h)
5340 {
5341     MpegEncContext * const s = &h->s;
5342     const int mb_xy  = h->mb_xy;
5343     h->top_mb_xy     = mb_xy - s->mb_stride;
5344     h->left_mb_xy[0] = mb_xy - 1;
5345     if(FRAME_MBAFF){
5346         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5347         const int top_pair_xy      = pair_xy     - s->mb_stride;
5348         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5349         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5350         const int curr_mb_frame_flag = !MB_FIELD;
5351         const int bottom = (s->mb_y & 1);
5352         if (bottom
5353                 ? !curr_mb_frame_flag // bottom macroblock
5354                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5355                 ) {
5356             h->top_mb_xy -= s->mb_stride;
5357         }
5358         if (left_mb_frame_flag != curr_mb_frame_flag) {
5359             h->left_mb_xy[0] = pair_xy - 1;
5360         }
5361     } else if (FIELD_PICTURE) {
5362         h->top_mb_xy -= s->mb_stride;
5363     }
5364     return;
5365 }
5366
5367 /**
5368  * decodes a macroblock
5369  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5370  */
5371 static int decode_mb_cabac(H264Context *h) {
5372     MpegEncContext * const s = &h->s;
5373     int mb_xy;
5374     int mb_type, partition_count, cbp = 0;
5375     int dct8x8_allowed= h->pps.transform_8x8_mode;
5376
5377     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5378
5379     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5380
5381     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5382     if( h->slice_type_nos != FF_I_TYPE ) {
5383         int skip;
5384         /* a skipped mb needs the aff flag from the following mb */
5385         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5386             predict_field_decoding_flag(h);
5387         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5388             skip = h->next_mb_skipped;
5389         else
5390             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5391         /* read skip flags */
5392         if( skip ) {
5393             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5394                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5395                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5396                 if(h->next_mb_skipped)
5397                     predict_field_decoding_flag(h);
5398                 else
5399                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5400             }
5401
5402             decode_mb_skip(h);
5403
5404             h->cbp_table[mb_xy] = 0;
5405             h->chroma_pred_mode_table[mb_xy] = 0;
5406             h->last_qscale_diff = 0;
5407
5408             return 0;
5409
5410         }
5411     }
5412     if(FRAME_MBAFF){
5413         if( (s->mb_y&1) == 0 )
5414             h->mb_mbaff =
5415             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5416     }
5417
5418     h->prev_mb_skipped = 0;
5419
5420     compute_mb_neighbors(h);
5421     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5422         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5423         return -1;
5424     }
5425
5426     if( h->slice_type_nos == FF_B_TYPE ) {
5427         if( mb_type < 23 ){
5428             partition_count= b_mb_type_info[mb_type].partition_count;
5429             mb_type=         b_mb_type_info[mb_type].type;
5430         }else{
5431             mb_type -= 23;
5432             goto decode_intra_mb;
5433         }
5434     } else if( h->slice_type_nos == FF_P_TYPE ) {
5435         if( mb_type < 5) {
5436             partition_count= p_mb_type_info[mb_type].partition_count;
5437             mb_type=         p_mb_type_info[mb_type].type;
5438         } else {
5439             mb_type -= 5;
5440             goto decode_intra_mb;
5441         }
5442     } else {
5443         if(h->slice_type == FF_SI_TYPE && mb_type)
5444             mb_type--;
5445         assert(h->slice_type_nos == FF_I_TYPE);
5446 decode_intra_mb:
5447         partition_count = 0;
5448         cbp= i_mb_type_info[mb_type].cbp;
5449         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5450         mb_type= i_mb_type_info[mb_type].type;
5451     }
5452     if(MB_FIELD)
5453         mb_type |= MB_TYPE_INTERLACED;
5454
5455     h->slice_table[ mb_xy ]= h->slice_num;
5456
5457     if(IS_INTRA_PCM(mb_type)) {
5458         const uint8_t *ptr;
5459
5460         // We assume these blocks are very rare so we do not optimize it.
5461         // FIXME The two following lines get the bitstream position in the cabac
5462         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5463         ptr= h->cabac.bytestream;
5464         if(h->cabac.low&0x1) ptr--;
5465         if(CABAC_BITS==16){
5466             if(h->cabac.low&0x1FF) ptr--;
5467         }
5468
5469         // The pixels are stored in the same order as levels in h->mb array.
5470         memcpy(h->mb, ptr, 256); ptr+=256;
5471         if(CHROMA){
5472             memcpy(h->mb+128, ptr, 128); ptr+=128;
5473         }
5474
5475         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5476
5477         // All blocks are present
5478         h->cbp_table[mb_xy] = 0x1ef;
5479         h->chroma_pred_mode_table[mb_xy] = 0;
5480         // In deblocking, the quantizer is 0
5481         s->current_picture.qscale_table[mb_xy]= 0;
5482         // All coeffs are present
5483         memset(h->non_zero_count[mb_xy], 16, 16);
5484         s->current_picture.mb_type[mb_xy]= mb_type;
5485         h->last_qscale_diff = 0;
5486         return 0;
5487     }
5488
5489     if(MB_MBAFF){
5490         h->ref_count[0] <<= 1;
5491         h->ref_count[1] <<= 1;
5492     }
5493
5494     fill_caches(h, mb_type, 0);
5495
5496     if( IS_INTRA( mb_type ) ) {
5497         int i, pred_mode;
5498         if( IS_INTRA4x4( mb_type ) ) {
5499             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5500                 mb_type |= MB_TYPE_8x8DCT;
5501                 for( i = 0; i < 16; i+=4 ) {
5502                     int pred = pred_intra_mode( h, i );
5503                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5504                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5505                 }
5506             } else {
5507                 for( i = 0; i < 16; i++ ) {
5508                     int pred = pred_intra_mode( h, i );
5509                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5510
5511                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5512                 }
5513             }
5514             write_back_intra_pred_mode(h);
5515             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5516         } else {
5517             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5518             if( h->intra16x16_pred_mode < 0 ) return -1;
5519         }
5520         if(CHROMA){
5521             h->chroma_pred_mode_table[mb_xy] =
5522             pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5523
5524             pred_mode= check_intra_pred_mode( h, pred_mode );
5525             if( pred_mode < 0 ) return -1;
5526             h->chroma_pred_mode= pred_mode;
5527         }
5528     } else if( partition_count == 4 ) {
5529         int i, j, sub_partition_count[4], list, ref[2][4];
5530
5531         if( h->slice_type_nos == FF_B_TYPE ) {
5532             for( i = 0; i < 4; i++ ) {
5533                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5534                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5535                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5536             }
5537             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5538                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5539                 pred_direct_motion(h, &mb_type);
5540                 h->ref_cache[0][scan8[4]] =
5541                 h->ref_cache[1][scan8[4]] =
5542                 h->ref_cache[0][scan8[12]] =
5543                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5544                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5545                     for( i = 0; i < 4; i++ )
5546                         if( IS_DIRECT(h->sub_mb_type[i]) )
5547                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5548                 }
5549             }
5550         } else {
5551             for( i = 0; i < 4; i++ ) {
5552                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5553                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5554                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5555             }
5556         }
5557
5558         for( list = 0; list < h->list_count; list++ ) {
5559                 for( i = 0; i < 4; i++ ) {
5560                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5561                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5562                         if( h->ref_count[list] > 1 )
5563                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5564                         else
5565                             ref[list][i] = 0;
5566                     } else {
5567                         ref[list][i] = -1;
5568                     }
5569                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5570                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5571                 }
5572         }
5573
5574         if(dct8x8_allowed)
5575             dct8x8_allowed = get_dct8x8_allowed(h);
5576
5577         for(list=0; list<h->list_count; list++){
5578             for(i=0; i<4; i++){
5579                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5580                 if(IS_DIRECT(h->sub_mb_type[i])){
5581                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5582                     continue;
5583                 }
5584
5585                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5586                     const int sub_mb_type= h->sub_mb_type[i];
5587                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5588                     for(j=0; j<sub_partition_count[i]; j++){
5589                         int mpx, mpy;
5590                         int mx, my;
5591                         const int index= 4*i + block_width*j;
5592                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5593                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5594                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5595
5596                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5597                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5598                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5599
5600                         if(IS_SUB_8X8(sub_mb_type)){
5601                             mv_cache[ 1 ][0]=
5602                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5603                             mv_cache[ 1 ][1]=
5604                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5605
5606                             mvd_cache[ 1 ][0]=
5607                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5608                             mvd_cache[ 1 ][1]=
5609                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5610                         }else if(IS_SUB_8X4(sub_mb_type)){
5611                             mv_cache[ 1 ][0]= mx;
5612                             mv_cache[ 1 ][1]= my;
5613
5614                             mvd_cache[ 1 ][0]= mx - mpx;
5615                             mvd_cache[ 1 ][1]= my - mpy;
5616                         }else if(IS_SUB_4X8(sub_mb_type)){
5617                             mv_cache[ 8 ][0]= mx;
5618                             mv_cache[ 8 ][1]= my;
5619
5620                             mvd_cache[ 8 ][0]= mx - mpx;
5621                             mvd_cache[ 8 ][1]= my - mpy;
5622                         }
5623                         mv_cache[ 0 ][0]= mx;
5624                         mv_cache[ 0 ][1]= my;
5625
5626                         mvd_cache[ 0 ][0]= mx - mpx;
5627                         mvd_cache[ 0 ][1]= my - mpy;
5628                     }
5629                 }else{
5630                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5631                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5632                     p[0] = p[1] = p[8] = p[9] = 0;
5633                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5634                 }
5635             }
5636         }
5637     } else if( IS_DIRECT(mb_type) ) {
5638         pred_direct_motion(h, &mb_type);
5639         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5640         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5641         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5642     } else {
5643         int list, mx, my, i, mpx, mpy;
5644         if(IS_16X16(mb_type)){
5645             for(list=0; list<h->list_count; list++){
5646                 if(IS_DIR(mb_type, 0, list)){
5647                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
5648                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5649                 }else
5650                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5651             }
5652             for(list=0; list<h->list_count; list++){
5653                 if(IS_DIR(mb_type, 0, list)){
5654                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5655
5656                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5657                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5658                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5659
5660                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5661                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5662                 }else
5663                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5664             }
5665         }
5666         else if(IS_16X8(mb_type)){
5667             for(list=0; list<h->list_count; list++){
5668                     for(i=0; i<2; i++){
5669                         if(IS_DIR(mb_type, i, list)){
5670                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
5671                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5672                         }else
5673                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5674                     }
5675             }
5676             for(list=0; list<h->list_count; list++){
5677                 for(i=0; i<2; i++){
5678                     if(IS_DIR(mb_type, i, list)){
5679                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5680                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5681                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5682                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5683
5684                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5685                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5686                     }else{
5687                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5688                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5689                     }
5690                 }
5691             }
5692         }else{
5693             assert(IS_8X16(mb_type));
5694             for(list=0; list<h->list_count; list++){
5695                     for(i=0; i<2; i++){
5696                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5697                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
5698                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5699                         }else
5700                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5701                     }
5702             }
5703             for(list=0; list<h->list_count; list++){
5704                 for(i=0; i<2; i++){
5705                     if(IS_DIR(mb_type, i, list)){
5706                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5707                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5708                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5709
5710                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5711                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5712                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5713                     }else{
5714                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5715                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5716                     }
5717                 }
5718             }
5719         }
5720     }
5721
5722    if( IS_INTER( mb_type ) ) {
5723         h->chroma_pred_mode_table[mb_xy] = 0;
5724         write_back_motion( h, mb_type );
5725    }
5726
5727     if( !IS_INTRA16x16( mb_type ) ) {
5728         cbp  = decode_cabac_mb_cbp_luma( h );
5729         if(CHROMA)
5730             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5731     }
5732
5733     h->cbp_table[mb_xy] = h->cbp = cbp;
5734
5735     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5736         if( decode_cabac_mb_transform_size( h ) )
5737             mb_type |= MB_TYPE_8x8DCT;
5738     }
5739     s->current_picture.mb_type[mb_xy]= mb_type;
5740
5741     if( cbp || IS_INTRA16x16( mb_type ) ) {
5742         const uint8_t *scan, *scan8x8, *dc_scan;
5743         const uint32_t *qmul;
5744         int dqp;
5745
5746         if(IS_INTERLACED(mb_type)){
5747             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5748             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5749             dc_scan= luma_dc_field_scan;
5750         }else{
5751             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5752             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5753             dc_scan= luma_dc_zigzag_scan;
5754         }
5755
5756         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5757         if( dqp == INT_MIN ){
5758             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5759             return -1;
5760         }
5761         s->qscale += dqp;
5762         if(((unsigned)s->qscale) > 51){
5763             if(s->qscale<0) s->qscale+= 52;
5764             else            s->qscale-= 52;
5765         }
5766         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5767         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5768
5769         if( IS_INTRA16x16( mb_type ) ) {
5770             int i;
5771             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5772             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5773
5774             if( cbp&15 ) {
5775                 qmul = h->dequant4_coeff[0][s->qscale];
5776                 for( i = 0; i < 16; i++ ) {
5777                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5778                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5779                 }
5780             } else {
5781                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5782             }
5783         } else {
5784             int i8x8, i4x4;
5785             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5786                 if( cbp & (1<<i8x8) ) {
5787                     if( IS_8x8DCT(mb_type) ) {
5788                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5789                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5790                     } else {
5791                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5792                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5793                             const int index = 4*i8x8 + i4x4;
5794                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5795 //START_TIMER
5796                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5797 //STOP_TIMER("decode_residual")
5798                         }
5799                     }
5800                 } else {
5801                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5802                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5803                 }
5804             }
5805         }
5806
5807         if( cbp&0x30 ){
5808             int c;
5809             for( c = 0; c < 2; c++ ) {
5810                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5811                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5812             }
5813         }
5814
5815         if( cbp&0x20 ) {
5816             int c, i;
5817             for( c = 0; c < 2; c++ ) {
5818                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5819                 for( i = 0; i < 4; i++ ) {
5820                     const int index = 16 + 4 * c + i;
5821                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5822                     decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15);
5823                 }
5824             }
5825         } else {
5826             uint8_t * const nnz= &h->non_zero_count_cache[0];
5827             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5828             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5829         }
5830     } else {
5831         uint8_t * const nnz= &h->non_zero_count_cache[0];
5832         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5833         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5834         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5835         h->last_qscale_diff = 0;
5836     }
5837
5838     s->current_picture.qscale_table[mb_xy]= s->qscale;
5839     write_back_non_zero_count(h);
5840
5841     if(MB_MBAFF){
5842         h->ref_count[0] >>= 1;
5843         h->ref_count[1] >>= 1;
5844     }
5845
5846     return 0;
5847 }
5848
5849
5850 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5851     int i, d;
5852     const int index_a = qp + h->slice_alpha_c0_offset;
5853     const int alpha = (alpha_table+52)[index_a];
5854     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5855
5856     if( bS[0] < 4 ) {
5857         int8_t tc[4];
5858         for(i=0; i<4; i++)
5859             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
5860         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5861     } else {
5862         /* 16px edge length, because bS=4 is triggered by being at
5863          * the edge of an intra MB, so all 4 bS are the same */
5864             for( d = 0; d < 16; d++ ) {
5865                 const int p0 = pix[-1];
5866                 const int p1 = pix[-2];
5867                 const int p2 = pix[-3];
5868
5869                 const int q0 = pix[0];
5870                 const int q1 = pix[1];
5871                 const int q2 = pix[2];
5872
5873                 if( FFABS( p0 - q0 ) < alpha &&
5874                     FFABS( p1 - p0 ) < beta &&
5875                     FFABS( q1 - q0 ) < beta ) {
5876
5877                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5878                         if( FFABS( p2 - p0 ) < beta)
5879                         {
5880                             const int p3 = pix[-4];
5881                             /* p0', p1', p2' */
5882                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5883                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5884                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5885                         } else {
5886                             /* p0' */
5887                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5888                         }
5889                         if( FFABS( q2 - q0 ) < beta)
5890                         {
5891                             const int q3 = pix[3];
5892                             /* q0', q1', q2' */
5893                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5894                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5895                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5896                         } else {
5897                             /* q0' */
5898                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5899                         }
5900                     }else{
5901                         /* p0', q0' */
5902                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5903                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5904                     }
5905                     tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
5906                 }
5907                 pix += stride;
5908             }
5909     }
5910 }
5911 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5912     int i;
5913     const int index_a = qp + h->slice_alpha_c0_offset;
5914     const int alpha = (alpha_table+52)[index_a];
5915     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5916
5917     if( bS[0] < 4 ) {
5918         int8_t tc[4];
5919         for(i=0; i<4; i++)
5920             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
5921         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
5922     } else {
5923         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
5924     }
5925 }
5926
5927 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5928     int i;
5929     for( i = 0; i < 16; i++, pix += stride) {
5930         int index_a;
5931         int alpha;
5932         int beta;
5933
5934         int qp_index;
5935         int bS_index = (i >> 1);
5936         if (!MB_FIELD) {
5937             bS_index &= ~1;
5938             bS_index |= (i & 1);
5939         }
5940
5941         if( bS[bS_index] == 0 ) {
5942             continue;
5943         }
5944
5945         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
5946         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
5947         alpha = (alpha_table+52)[index_a];
5948         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
5949
5950         if( bS[bS_index] < 4 ) {
5951             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
5952             const int p0 = pix[-1];
5953             const int p1 = pix[-2];
5954             const int p2 = pix[-3];
5955             const int q0 = pix[0];
5956             const int q1 = pix[1];
5957             const int q2 = pix[2];
5958
5959             if( FFABS( p0 - q0 ) < alpha &&
5960                 FFABS( p1 - p0 ) < beta &&
5961                 FFABS( q1 - q0 ) < beta ) {
5962                 int tc = tc0;
5963                 int i_delta;
5964
5965                 if( FFABS( p2 - p0 ) < beta ) {
5966                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
5967                     tc++;
5968                 }
5969                 if( FFABS( q2 - q0 ) < beta ) {
5970                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
5971                     tc++;
5972                 }
5973
5974                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
5975                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
5976                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
5977                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
5978             }
5979         }else{
5980             const int p0 = pix[-1];
5981             const int p1 = pix[-2];
5982             const int p2 = pix[-3];
5983
5984             const int q0 = pix[0];
5985             const int q1 = pix[1];
5986             const int q2 = pix[2];
5987
5988             if( FFABS( p0 - q0 ) < alpha &&
5989                 FFABS( p1 - p0 ) < beta &&
5990                 FFABS( q1 - q0 ) < beta ) {
5991
5992                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5993                     if( FFABS( p2 - p0 ) < beta)
5994                     {
5995                         const int p3 = pix[-4];
5996                         /* p0', p1', p2' */
5997                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5998                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5999                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6000                     } else {
6001                         /* p0' */
6002                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6003                     }
6004                     if( FFABS( q2 - q0 ) < beta)
6005                     {
6006                         const int q3 = pix[3];
6007                         /* q0', q1', q2' */
6008                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6009                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6010                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6011                     } else {
6012                         /* q0' */
6013                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6014                     }
6015                 }else{
6016                     /* p0', q0' */
6017                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6018                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6019                 }
6020                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6021             }
6022         }
6023     }
6024 }
6025 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6026     int i;
6027     for( i = 0; i < 8; i++, pix += stride) {
6028         int index_a;
6029         int alpha;
6030         int beta;
6031
6032         int qp_index;
6033         int bS_index = i;
6034
6035         if( bS[bS_index] == 0 ) {
6036             continue;
6037         }
6038
6039         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6040         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6041         alpha = (alpha_table+52)[index_a];
6042         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6043
6044         if( bS[bS_index] < 4 ) {
6045             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6046             const int p0 = pix[-1];
6047             const int p1 = pix[-2];
6048             const int q0 = pix[0];
6049             const int q1 = pix[1];
6050
6051             if( FFABS( p0 - q0 ) < alpha &&
6052                 FFABS( p1 - p0 ) < beta &&
6053                 FFABS( q1 - q0 ) < beta ) {
6054                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6055
6056                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6057                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6058                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6059             }
6060         }else{
6061             const int p0 = pix[-1];
6062             const int p1 = pix[-2];
6063             const int q0 = pix[0];
6064             const int q1 = pix[1];
6065
6066             if( FFABS( p0 - q0 ) < alpha &&
6067                 FFABS( p1 - p0 ) < beta &&
6068                 FFABS( q1 - q0 ) < beta ) {
6069
6070                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6071                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6072                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6073             }
6074         }
6075     }
6076 }
6077
6078 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6079     int i, d;
6080     const int index_a = qp + h->slice_alpha_c0_offset;
6081     const int alpha = (alpha_table+52)[index_a];
6082     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6083     const int pix_next  = stride;
6084
6085     if( bS[0] < 4 ) {
6086         int8_t tc[4];
6087         for(i=0; i<4; i++)
6088             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6089         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6090     } else {
6091         /* 16px edge length, see filter_mb_edgev */
6092             for( d = 0; d < 16; d++ ) {
6093                 const int p0 = pix[-1*pix_next];
6094                 const int p1 = pix[-2*pix_next];
6095                 const int p2 = pix[-3*pix_next];
6096                 const int q0 = pix[0];
6097                 const int q1 = pix[1*pix_next];
6098                 const int q2 = pix[2*pix_next];
6099
6100                 if( FFABS( p0 - q0 ) < alpha &&
6101                     FFABS( p1 - p0 ) < beta &&
6102                     FFABS( q1 - q0 ) < beta ) {
6103
6104                     const int p3 = pix[-4*pix_next];
6105                     const int q3 = pix[ 3*pix_next];
6106
6107                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6108                         if( FFABS( p2 - p0 ) < beta) {
6109                             /* p0', p1', p2' */
6110                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6111                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6112                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6113                         } else {
6114                             /* p0' */
6115                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6116                         }
6117                         if( FFABS( q2 - q0 ) < beta) {
6118                             /* q0', q1', q2' */
6119                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6120                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6121                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6122                         } else {
6123                             /* q0' */
6124                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6125                         }
6126                     }else{
6127                         /* p0', q0' */
6128                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6129                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6130                     }
6131                     tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6132                 }
6133                 pix++;
6134             }
6135     }
6136 }
6137
6138 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6139     int i;
6140     const int index_a = qp + h->slice_alpha_c0_offset;
6141     const int alpha = (alpha_table+52)[index_a];
6142     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6143
6144     if( bS[0] < 4 ) {
6145         int8_t tc[4];
6146         for(i=0; i<4; i++)
6147             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6148         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6149     } else {
6150         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6151     }
6152 }
6153
/**
 * Deblock one macroblock, using a shortcut path when no special case applies.
 *
 * NOTE: the literal "1 ||" in the first condition makes that condition always
 * true, so at present every call is forwarded to filter_mb() and the rest of
 * this function is never executed. The shortcut code is kept below.
 *
 * @param h          decoder context
 * @param mb_x       macroblock column
 * @param mb_y       macroblock row
 * @param img_y      luma samples of this macroblock (edges are addressed as
 *                   img_y[4*edge] and img_y[4*edge*linesize])
 * @param img_cb     Cb samples of this macroblock
 * @param img_cr     Cr samples of this macroblock
 * @param linesize   luma line stride
 * @param uvlinesize chroma line stride
 */
6154 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6155     MpegEncContext * const s = &h->s;
         /* 1 for a bottom-field picture, 0 otherwise */
6156     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6157     int mb_xy, mb_type;
6158     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6159
6160     mb_xy = h->mb_xy;
6161
         /* Fall back to the generic filter_mb() for the first column, the first
          * row, a missing strength DSP function, a non-zero pps.chroma_qp_diff,
          * or (deblocking_filter == 2) a top/left neighbour in another slice.
          * The bare "1 ||" forces the fallback unconditionally. */
6162     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6163 1 ||
6164        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6165                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6166         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6167         return;
6168     }
6169     assert(!FRAME_MBAFF);
6170
         /* qp: this MB, qp0: left neighbour, qp1: top neighbour; the qpc*
          * values are the matching chroma quantizers (table 0 only) */
6171     mb_type = s->current_picture.mb_type[mb_xy];
6172     qp = s->current_picture.qscale_table[mb_xy];
6173     qp0 = s->current_picture.qscale_table[mb_xy-1];
6174     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6175     qpc = get_chroma_qp( h, 0, qp );
6176     qpc0 = get_chroma_qp( h, 0, qp0 );
6177     qpc1 = get_chroma_qp( h, 0, qp1 );
         /* from here on qp0/qp1/qpc0/qpc1 hold the rounded average of the
          * current and the neighbouring quantizer, used on the MB boundary */
6178     qp0 = (qp + qp0 + 1) >> 1;
6179     qp1 = (qp + qp1 + 1) >> 1;
6180     qpc0 = (qpc + qpc0 + 1) >> 1;
6181     qpc1 = (qpc + qpc1 + 1) >> 1;
         /* nothing to do when every quantizer is at or below the threshold
          * (same low-qp shortcut idea as in filter_mb()) */
6182     qp_thresh = 15 - h->slice_alpha_c0_offset;
6183     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6184        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6185         return;
6186
6187     if( IS_INTRA(mb_type) ) {
             /* intra MB: fixed strengths, 4 on the left MB edge, 3 on inner
              * edges; the top MB edge gets 3 when FIELD_PICTURE, else 4 */
6188         int16_t bS4[4] = {4,4,4,4};
6189         int16_t bS3[4] = {3,3,3,3};
6190         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6191         if( IS_8x8DCT(mb_type) ) {
                 /* 8x8 transform: only luma edges 0 and 2 are filtered */
6192             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6193             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6194             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6195             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6196         } else {
6197             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6198             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6199             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6200             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6201             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6202             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6203             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6204             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6205         }
             /* chroma: edges 0 and 2 only, same strengths for Cb and Cr */
6206         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6207         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6208         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6209         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6210         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6211         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6212         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6213         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6214         return;
6215     } else {
             /* bS[dir][edge][4]: dir 0 = vertical edges, dir 1 = horizontal.
              * bSv views the four int16 strengths of one edge as a single
              * 64-bit word so an edge can be set or tested in one access. */
6216         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6217         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6218         int edges;
6219         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
                 /* strength 2 on edges 0 and 2 in both directions; the other
                  * entries stay unset and are not read by the 8x8 DCT FILTER
                  * calls below */
6220             edges = 4;
6221             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6222         } else {
                 /* let the DSP routine derive the strengths from the nnz, ref
                  * and mv caches; the masks are computed from the partition
                  * type bits of this MB and of the left neighbour */
6223             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6224                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6225             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6226                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6227                              ? 3 : 0;
6228             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6229             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6230             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6231                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6232         }
             /* an intra neighbour overrides the strengths of the shared MB edge */
6233         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6234             bSv[0][0] = 0x0004000400040004ULL;
6235         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6236             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6237
             /* FILTER(hv,dir,edge): if any strength of the edge is non-zero,
              * filter the luma edge and, for even edges, the Cb and Cr edges.
              * Edge 0 uses the neighbour-averaged quantizer (qp0/qp1, qpc0/qpc1
              * via token pasting on dir), inner edges use qp/qpc. */
6238 #define FILTER(hv,dir,edge)\
6239         if(bSv[dir][edge]) {\
6240             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6241             if(!(edge&1)) {\
6242                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6243                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6244             }\
6245         }
6246         if( edges == 1 ) {
6247             FILTER(v,0,0);
6248             FILTER(h,1,0);
6249         } else if( IS_8x8DCT(mb_type) ) {
6250             FILTER(v,0,0);
6251             FILTER(v,0,2);
6252             FILTER(h,1,0);
6253             FILTER(h,1,2);
6254         } else {
6255             FILTER(v,0,0);
6256             FILTER(v,0,1);
6257             FILTER(v,0,2);
6258             FILTER(v,0,3);
6259             FILTER(h,1,0);
6260             FILTER(h,1,1);
6261             FILTER(h,1,2);
6262             FILTER(h,1,3);
6263         }
6264 #undef FILTER
6265     }
6266 }
6267
6268 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6269     MpegEncContext * const s = &h->s;
6270     const int mb_xy= mb_x + mb_y*s->mb_stride;
6271     const int mb_type = s->current_picture.mb_type[mb_xy];
6272     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6273     int first_vertical_edge_done = 0;
6274     int dir;
6275
6276     //for sufficiently low qp, filtering wouldn't do anything
6277     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6278     if(!FRAME_MBAFF){
6279         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6280         int qp = s->current_picture.qscale_table[mb_xy];
6281         if(qp <= qp_thresh
6282            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6283            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6284             return;
6285         }
6286     }
6287
6288     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6289     if(!h->pps.cabac && h->pps.transform_8x8_mode){
6290         int top_type, left_type[2];
6291         top_type     = s->current_picture.mb_type[h->top_mb_xy]    ;
6292         left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6293         left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6294
6295         if(IS_8x8DCT(top_type)){
6296             h->non_zero_count_cache[4+8*0]=
6297             h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6298             h->non_zero_count_cache[6+8*0]=
6299             h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6300         }
6301         if(IS_8x8DCT(left_type[0])){
6302             h->non_zero_count_cache[3+8*1]=
6303             h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6304         }
6305         if(IS_8x8DCT(left_type[1])){
6306             h->non_zero_count_cache[3+8*3]=
6307             h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6308         }
6309
6310         if(IS_8x8DCT(mb_type)){
6311             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
6312             h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp_table[mb_xy] & 1;
6313
6314             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6315             h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp_table[mb_xy] & 2;
6316
6317             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6318             h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp_table[mb_xy] & 4;
6319
6320             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6321             h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp_table[mb_xy] & 8;
6322         }
6323     }
6324
6325     if (FRAME_MBAFF
6326             // left mb is in picture
6327             && h->slice_table[mb_xy-1] != 255
6328             // and current and left pair do not have the same interlaced type
6329             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6330             // and left mb is in the same slice if deblocking_filter == 2
6331             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6332         /* First vertical edge is different in MBAFF frames
6333          * There are 8 different bS to compute and 2 different Qp
6334          */
6335         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6336         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6337         int16_t bS[8];
6338         int qp[2];
6339         int bqp[2];
6340         int rqp[2];
6341         int mb_qp, mbn0_qp, mbn1_qp;
6342         int i;
6343         first_vertical_edge_done = 1;
6344
6345         if( IS_INTRA(mb_type) )
6346             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6347         else {
6348             for( i = 0; i < 8; i++ ) {
6349                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6350
6351                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6352                     bS[i] = 4;
6353                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6354                          /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
6355                          h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
6356                     bS[i] = 2;
6357                 else
6358                     bS[i] = 1;
6359             }
6360         }
6361
6362         mb_qp = s->current_picture.qscale_table[mb_xy];
6363         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6364         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6365         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6366         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6367                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6368         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6369                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6370         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6371         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6372                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6373         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6374                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6375
6376         /* Filter edge */
6377         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6378         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6379         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6380         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6381         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6382     }
6383     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6384     for( dir = 0; dir < 2; dir++ )
6385     {
6386         int edge;
6387         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6388         const int mbm_type = s->current_picture.mb_type[mbm_xy];
6389         int (*ref2frm) [64] = h->ref2frm[ h->slice_num          &15 ][0] + (MB_MBAFF ? 20 : 2);
6390         int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&15 ][0] + (MB_MBAFF ? 20 : 2);
6391         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
6392
6393         const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6394                                   == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6395         // how often to recheck mv-based bS when iterating between edges
6396         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6397                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6398         // how often to recheck mv-based bS when iterating along each edge
6399         const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6400
6401         if (first_vertical_edge_done) {
6402             start = 1;
6403             first_vertical_edge_done = 0;
6404         }
6405
6406         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6407             start = 1;
6408
6409         if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6410             && !IS_INTERLACED(mb_type)
6411             && IS_INTERLACED(mbm_type)
6412             ) {
6413             // This is a special case in the norm where the filtering must
6414             // be done twice (one each of the field) even if we are in a
6415             // frame macroblock.
6416             //
6417             static const int nnz_idx[4] = {4,5,6,3};
6418             unsigned int tmp_linesize   = 2 *   linesize;
6419             unsigned int tmp_uvlinesize = 2 * uvlinesize;
6420             int mbn_xy = mb_xy - 2 * s->mb_stride;
6421             int qp;
6422             int i, j;
6423             int16_t bS[4];
6424
6425             for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6426                 if( IS_INTRA(mb_type) ||
6427                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6428                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6429                 } else {
6430                     const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6431                     for( i = 0; i < 4; i++ ) {
6432                         if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6433                             mbn_nnz[nnz_idx[i]] != 0 )
6434                             bS[i] = 2;
6435                         else
6436                             bS[i] = 1;
6437                     }
6438                 }
6439                 // Do not use s->qscale as luma quantizer because it has not the same
6440                 // value in IPCM macroblocks.
6441                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6442                 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6443                 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6444                 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6445                 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6446                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6447                 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6448                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6449             }
6450
6451             start = 1;
6452         }
6453
6454         /* Calculate bS */
6455         for( edge = start; edge < edges; edge++ ) {
6456             /* mbn_xy: neighbor macroblock */
6457             const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6458             const int mbn_type = s->current_picture.mb_type[mbn_xy];
6459             int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
6460             int16_t bS[4];
6461             int qp;
6462
6463             if( (edge&1) && IS_8x8DCT(mb_type) )
6464                 continue;
6465
6466             if( IS_INTRA(mb_type) ||
6467                 IS_INTRA(mbn_type) ) {
6468                 int value;
6469                 if (edge == 0) {
6470                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6471                         || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6472                     ) {
6473                         value = 4;
6474                     } else {
6475                         value = 3;
6476                     }
6477                 } else {
6478                     value = 3;
6479                 }
6480                 bS[0] = bS[1] = bS[2] = bS[3] = value;
6481             } else {
6482                 int i, l;
6483                 int mv_done;
6484
6485                 if( edge & mask_edge ) {
6486                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
6487                     mv_done = 1;
6488                 }
6489                 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6490                     bS[0] = bS[1] = bS[2] = bS[3] = 1;
6491                     mv_done = 1;
6492                 }
6493                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6494                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
6495                     int bn_idx= b_idx - (dir ? 8:1);
6496                     int v = 0;
6497
6498                     for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6499                         v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6500                              FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6501                              FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6502                     }
6503
6504                     if(h->slice_type_nos == FF_B_TYPE && v){
6505                         v=0;
6506                         for( l = 0; !v && l < 2; l++ ) {
6507                             int ln= 1-l;
6508                             v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6509                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6510                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6511                         }
6512                     }
6513
6514                     bS[0] = bS[1] = bS[2] = bS[3] = v;
6515                     mv_done = 1;
6516                 }
6517                 else
6518                     mv_done = 0;
6519
6520                 for( i = 0; i < 4; i++ ) {
6521                     int x = dir == 0 ? edge : i;
6522                     int y = dir == 0 ? i    : edge;
6523                     int b_idx= 8 + 4 + x + 8*y;
6524                     int bn_idx= b_idx - (dir ? 8:1);
6525
6526                     if( h->non_zero_count_cache[b_idx] != 0 ||
6527                         h->non_zero_count_cache[bn_idx] != 0 ) {
6528                         bS[i] = 2;
6529                     }
6530                     else if(!mv_done)
6531                     {
6532                         bS[i] = 0;
6533                         for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6534                             if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6535                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6536                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6537                                 bS[i] = 1;
6538                                 break;
6539                             }
6540                         }
6541
6542                         if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6543                             bS[i] = 0;
6544                             for( l = 0; l < 2; l++ ) {
6545                                 int ln= 1-l;
6546                                 if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6547                                     FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6548                                     FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6549                                     bS[i] = 1;
6550                                     break;
6551                                 }
6552                             }
6553                         }
6554                     }
6555                 }
6556
6557                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6558                     continue;
6559             }
6560
6561             /* Filter edge */
6562             // Do not use s->qscale as luma quantizer because it has not the same
6563             // value in IPCM macroblocks.
6564             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6565             //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6566             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6567             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6568             if( dir == 0 ) {
6569                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6570                 if( (edge&1) == 0 ) {
6571                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6572                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6573                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6574                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6575                 }
6576             } else {
6577                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6578                 if( (edge&1) == 0 ) {
6579                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6580                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6581                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6582                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6583                 }
6584             }
6585         }
6586     }
6587 }
6588
6589 static int decode_slice(struct AVCodecContext *avctx, H264Context *h){
6590     MpegEncContext * const s = &h->s;
6591     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6592
6593     s->mb_skip_run= -1;
6594
6595     if( h->pps.cabac ) {
6596         int i;
6597
6598         /* realign */
6599         align_get_bits( &s->gb );
6600
6601         /* init cabac */
6602         ff_init_cabac_states( &h->cabac);
6603         ff_init_cabac_decoder( &h->cabac,
6604                                s->gb.buffer + get_bits_count(&s->gb)/8,
6605                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6606         /* calculate pre-state */
6607         for( i= 0; i < 460; i++ ) {
6608             int pre;
6609             if( h->slice_type_nos == FF_I_TYPE )
6610                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6611             else
6612                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6613
6614             if( pre <= 63 )
6615                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6616             else
6617                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6618         }
6619
6620         for(;;){
6621 //START_TIMER
6622             int ret = decode_mb_cabac(h);
6623             int eos;
6624 //STOP_TIMER("decode_mb_cabac")
6625
6626             if(ret>=0) hl_decode_mb(h);
6627
6628             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6629                 s->mb_y++;
6630
6631                 if(ret>=0) ret = decode_mb_cabac(h);
6632
6633                 if(ret>=0) hl_decode_mb(h);
6634                 s->mb_y--;
6635             }
6636             eos = get_cabac_terminate( &h->cabac );
6637
6638             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6639                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6640                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6641                 return -1;
6642             }
6643
6644             if( ++s->mb_x >= s->mb_width ) {
6645                 s->mb_x = 0;
6646                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6647                 ++s->mb_y;
6648                 if(FIELD_OR_MBAFF_PICTURE) {
6649                     ++s->mb_y;
6650                 }
6651             }
6652
6653             if( eos || s->mb_y >= s->mb_height ) {
6654                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6655                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6656                 return 0;
6657             }
6658         }
6659
6660     } else {
6661         for(;;){
6662             int ret = decode_mb_cavlc(h);
6663
6664             if(ret>=0) hl_decode_mb(h);
6665
6666             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6667                 s->mb_y++;
6668                 ret = decode_mb_cavlc(h);
6669
6670                 if(ret>=0) hl_decode_mb(h);
6671                 s->mb_y--;
6672             }
6673
6674             if(ret<0){
6675                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6676                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6677
6678                 return -1;
6679             }
6680
6681             if(++s->mb_x >= s->mb_width){
6682                 s->mb_x=0;
6683                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6684                 ++s->mb_y;
6685                 if(FIELD_OR_MBAFF_PICTURE) {
6686                     ++s->mb_y;
6687                 }
6688                 if(s->mb_y >= s->mb_height){
6689                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6690
6691                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6692                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6693
6694                         return 0;
6695                     }else{
6696                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6697
6698                         return -1;
6699                     }
6700                 }
6701             }
6702
6703             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6704                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6705                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6706                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6707
6708                     return 0;
6709                 }else{
6710                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6711
6712                     return -1;
6713                 }
6714             }
6715         }
6716     }
6717
6718 #if 0
6719     for(;s->mb_y < s->mb_height; s->mb_y++){
6720         for(;s->mb_x < s->mb_width; s->mb_x++){
6721             int ret= decode_mb(h);
6722
6723             hl_decode_mb(h);
6724
6725             if(ret<0){
6726                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6727                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6728
6729                 return -1;
6730             }
6731
6732             if(++s->mb_x >= s->mb_width){
6733                 s->mb_x=0;
6734                 if(++s->mb_y >= s->mb_height){
6735                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6736                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6737
6738                         return 0;
6739                     }else{
6740                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6741
6742                         return -1;
6743                     }
6744                 }
6745             }
6746
6747             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6748                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6749                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6750
6751                     return 0;
6752                 }else{
6753                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6754
6755                     return -1;
6756                 }
6757             }
6758         }
6759         s->mb_x=0;
6760         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6761     }
6762 #endif
6763     return -1; //not reached
6764 }
6765
6766 static int decode_unregistered_user_data(H264Context *h, int size){
6767     MpegEncContext * const s = &h->s;
6768     uint8_t user_data[16+256];
6769     int e, build, i;
6770
6771     if(size<16)
6772         return -1;
6773
6774     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6775         user_data[i]= get_bits(&s->gb, 8);
6776     }
6777
6778     user_data[i]= 0;
6779     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6780     if(e==1 && build>=0)
6781         h->x264_build= build;
6782
6783     if(s->avctx->debug & FF_DEBUG_BUGS)
6784         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6785
6786     for(; i<size; i++)
6787         skip_bits(&s->gb, 8);
6788
6789     return 0;
6790 }
6791
6792 static int decode_sei(H264Context *h){
6793     MpegEncContext * const s = &h->s;
6794
6795     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6796         int size, type;
6797
6798         type=0;
6799         do{
6800             type+= show_bits(&s->gb, 8);
6801         }while(get_bits(&s->gb, 8) == 255);
6802
6803         size=0;
6804         do{
6805             size+= show_bits(&s->gb, 8);
6806         }while(get_bits(&s->gb, 8) == 255);
6807
6808         switch(type){
6809         case 5:
6810             if(decode_unregistered_user_data(h, size) < 0)
6811                 return -1;
6812             break;
6813         default:
6814             skip_bits(&s->gb, 8*size);
6815         }
6816
6817         //FIXME check bits here
6818         align_get_bits(&s->gb);
6819     }
6820
6821     return 0;
6822 }
6823
6824 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
6825     MpegEncContext * const s = &h->s;
6826     int cpb_count, i;
6827     cpb_count = get_ue_golomb(&s->gb) + 1;
6828     get_bits(&s->gb, 4); /* bit_rate_scale */
6829     get_bits(&s->gb, 4); /* cpb_size_scale */
6830     for(i=0; i<cpb_count; i++){
6831         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6832         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6833         get_bits1(&s->gb);     /* cbr_flag */
6834     }
6835     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6836     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
6837     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
6838     get_bits(&s->gb, 5); /* time_offset_length */
6839 }
6840
6841 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6842     MpegEncContext * const s = &h->s;
6843     int aspect_ratio_info_present_flag;
6844     unsigned int aspect_ratio_idc;
6845     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
6846
6847     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6848
6849     if( aspect_ratio_info_present_flag ) {
6850         aspect_ratio_idc= get_bits(&s->gb, 8);
6851         if( aspect_ratio_idc == EXTENDED_SAR ) {
6852             sps->sar.num= get_bits(&s->gb, 16);
6853             sps->sar.den= get_bits(&s->gb, 16);
6854         }else if(aspect_ratio_idc < sizeof(pixel_aspect)/sizeof(*pixel_aspect)){
6855             sps->sar=  pixel_aspect[aspect_ratio_idc];
6856         }else{
6857             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6858             return -1;
6859         }
6860     }else{
6861         sps->sar.num=
6862         sps->sar.den= 0;
6863     }
6864 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6865
6866     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6867         get_bits1(&s->gb);      /* overscan_appropriate_flag */
6868     }
6869
6870     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
6871         get_bits(&s->gb, 3);    /* video_format */
6872         get_bits1(&s->gb);      /* video_full_range_flag */
6873         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
6874             get_bits(&s->gb, 8); /* colour_primaries */
6875             get_bits(&s->gb, 8); /* transfer_characteristics */
6876             get_bits(&s->gb, 8); /* matrix_coefficients */
6877         }
6878     }
6879
6880     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
6881         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
6882         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
6883     }
6884
6885     sps->timing_info_present_flag = get_bits1(&s->gb);
6886     if(sps->timing_info_present_flag){
6887         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
6888         sps->time_scale = get_bits_long(&s->gb, 32);
6889         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
6890     }
6891
6892     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
6893     if(nal_hrd_parameters_present_flag)
6894         decode_hrd_parameters(h, sps);
6895     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
6896     if(vcl_hrd_parameters_present_flag)
6897         decode_hrd_parameters(h, sps);
6898     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
6899         get_bits1(&s->gb);     /* low_delay_hrd_flag */
6900     get_bits1(&s->gb);         /* pic_struct_present_flag */
6901
6902     sps->bitstream_restriction_flag = get_bits1(&s->gb);
6903     if(sps->bitstream_restriction_flag){
6904         unsigned int num_reorder_frames;
6905         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
6906         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
6907         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
6908         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
6909         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
6910         num_reorder_frames= get_ue_golomb(&s->gb);
6911         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
6912
6913         if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
6914             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", num_reorder_frames);
6915             return -1;
6916         }
6917
6918         sps->num_reorder_frames= num_reorder_frames;
6919     }
6920
6921     return 0;
6922 }
6923
6924 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
6925                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
6926     MpegEncContext * const s = &h->s;
6927     int i, last = 8, next = 8;
6928     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
6929     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
6930         memcpy(factors, fallback_list, size*sizeof(uint8_t));
6931     else
6932     for(i=0;i<size;i++){
6933         if(next)
6934             next = (last + get_se_golomb(&s->gb)) & 0xff;
6935         if(!i && !next){ /* matrix not written, we use the preset one */
6936             memcpy(factors, jvt_list, size*sizeof(uint8_t));
6937             break;
6938         }
6939         last = factors[scan[i]] = next ? next : last;
6940     }
6941 }
6942
6943 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
6944                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
6945     MpegEncContext * const s = &h->s;
6946     int fallback_sps = !is_sps && sps->scaling_matrix_present;
6947     const uint8_t *fallback[4] = {
6948         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
6949         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
6950         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
6951         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
6952     };
6953     if(get_bits1(&s->gb)){
6954         sps->scaling_matrix_present |= is_sps;
6955         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
6956         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
6957         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
6958         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
6959         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
6960         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
6961         if(is_sps || pps->transform_8x8_mode){
6962             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
6963             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
6964         }
6965     } else if(fallback_sps) {
6966         memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
6967         memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
6968     }
6969 }
6970
6971 /**
6972  * Returns and optionally allocates SPS / PPS structures in the supplied array 'vec'
6973  */
6974 static void *
6975 alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
6976                     const size_t size, const char *name)
6977 {
6978     if(id>=max) {
6979         av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%d) out of range\n", name, id);
6980         return NULL;
6981     }
6982
6983     if(!vec[id]) {
6984         vec[id] = av_mallocz(size);
6985         if(vec[id] == NULL)
6986             av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
6987     }
6988     return vec[id];
6989 }
6990
6991 static inline int decode_seq_parameter_set(H264Context *h){
6992     MpegEncContext * const s = &h->s;
6993     int profile_idc, level_idc;
6994     unsigned int sps_id, tmp, mb_width, mb_height;
6995     int i;
6996     SPS *sps;
6997
6998     profile_idc= get_bits(&s->gb, 8);
6999     get_bits1(&s->gb);   //constraint_set0_flag
7000     get_bits1(&s->gb);   //constraint_set1_flag
7001     get_bits1(&s->gb);   //constraint_set2_flag
7002     get_bits1(&s->gb);   //constraint_set3_flag
7003     get_bits(&s->gb, 4); // reserved
7004     level_idc= get_bits(&s->gb, 8);
7005     sps_id= get_ue_golomb(&s->gb);
7006
7007     sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
7008     if(sps == NULL)
7009         return -1;
7010
7011     sps->profile_idc= profile_idc;
7012     sps->level_idc= level_idc;
7013
7014     if(sps->profile_idc >= 100){ //high profile
7015         sps->chroma_format_idc= get_ue_golomb(&s->gb);
7016         if(sps->chroma_format_idc == 3)
7017             get_bits1(&s->gb);  //residual_color_transform_flag
7018         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7019         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7020         sps->transform_bypass = get_bits1(&s->gb);
7021         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7022     }else{
7023         sps->scaling_matrix_present = 0;
7024         sps->chroma_format_idc= 1;
7025     }
7026
7027     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7028     sps->poc_type= get_ue_golomb(&s->gb);
7029
7030     if(sps->poc_type == 0){ //FIXME #define
7031         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7032     } else if(sps->poc_type == 1){//FIXME #define
7033         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7034         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7035         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7036         tmp= get_ue_golomb(&s->gb);
7037
7038         if(tmp >= sizeof(sps->offset_for_ref_frame) / sizeof(sps->offset_for_ref_frame[0])){
7039             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
7040             return -1;
7041         }
7042         sps->poc_cycle_length= tmp;
7043
7044         for(i=0; i<sps->poc_cycle_length; i++)
7045             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7046     }else if(sps->poc_type != 2){
7047         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7048         return -1;
7049     }
7050
7051     tmp= get_ue_golomb(&s->gb);
7052     if(tmp > MAX_PICTURE_COUNT-2 || tmp >= 32){
7053         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7054         return -1;
7055     }
7056     sps->ref_frame_count= tmp;
7057     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7058     mb_width= get_ue_golomb(&s->gb) + 1;
7059     mb_height= get_ue_golomb(&s->gb) + 1;
7060     if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
7061        avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
7062         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7063         return -1;
7064     }
7065     sps->mb_width = mb_width;
7066     sps->mb_height= mb_height;
7067
7068     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7069     if(!sps->frame_mbs_only_flag)
7070         sps->mb_aff= get_bits1(&s->gb);
7071     else
7072         sps->mb_aff= 0;
7073
7074     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7075
7076 #ifndef ALLOW_INTERLACE
7077     if(sps->mb_aff)
7078         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7079 #endif
7080     if(!sps->direct_8x8_inference_flag && sps->mb_aff)
7081         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
7082
7083     sps->crop= get_bits1(&s->gb);
7084     if(sps->crop){
7085         sps->crop_left  = get_ue_golomb(&s->gb);
7086         sps->crop_right = get_ue_golomb(&s->gb);
7087         sps->crop_top   = get_ue_golomb(&s->gb);
7088         sps->crop_bottom= get_ue_golomb(&s->gb);
7089         if(sps->crop_left || sps->crop_top){
7090             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7091         }
7092         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !h->sps.frame_mbs_only_flag)){
7093             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7094         }
7095     }else{
7096         sps->crop_left  =
7097         sps->crop_right =
7098         sps->crop_top   =
7099         sps->crop_bottom= 0;
7100     }
7101
7102     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7103     if( sps->vui_parameters_present_flag )
7104         decode_vui_parameters(h, sps);
7105
7106     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7107         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s\n",
7108                sps_id, sps->profile_idc, sps->level_idc,
7109                sps->poc_type,
7110                sps->ref_frame_count,
7111                sps->mb_width, sps->mb_height,
7112                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7113                sps->direct_8x8_inference_flag ? "8B8" : "",
7114                sps->crop_left, sps->crop_right,
7115                sps->crop_top, sps->crop_bottom,
7116                sps->vui_parameters_present_flag ? "VUI" : "",
7117                ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc]
7118                );
7119     }
7120     return 0;
7121 }
7122
7123 static void
7124 build_qp_table(PPS *pps, int t, int index)
7125 {
7126     int i;
7127     for(i = 0; i < 52; i++)
7128         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7129 }
7130
7131 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7132     MpegEncContext * const s = &h->s;
7133     unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
7134     PPS *pps;
7135
7136     pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
7137     if(pps == NULL)
7138         return -1;
7139
7140     tmp= get_ue_golomb(&s->gb);
7141     if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
7142         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7143         return -1;
7144     }
7145     pps->sps_id= tmp;
7146
7147     pps->cabac= get_bits1(&s->gb);
7148     pps->pic_order_present= get_bits1(&s->gb);
7149     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7150     if(pps->slice_group_count > 1 ){
7151         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7152         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7153         switch(pps->mb_slice_group_map_type){
7154         case 0:
7155 #if 0
7156 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7157 |    run_length[ i ]                                |1  |ue(v)   |
7158 #endif
7159             break;
7160         case 2:
7161 #if 0
7162 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7163 |{                                                  |   |        |
7164 |    top_left_mb[ i ]                               |1  |ue(v)   |
7165 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7166 |   }                                               |   |        |
7167 #endif
7168             break;
7169         case 3:
7170         case 4:
7171         case 5:
7172 #if 0
7173 |   slice_group_change_direction_flag               |1  |u(1)    |
7174 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7175 #endif
7176             break;
7177         case 6:
7178 #if 0
7179 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7180 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7181 |)                                                  |   |        |
7182 |    slice_group_id[ i ]                            |1  |u(v)    |
7183 #endif
7184             break;
7185         }
7186     }
7187     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7188     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7189     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7190         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7191         pps->ref_count[0]= pps->ref_count[1]= 1;
7192         return -1;
7193     }
7194
7195     pps->weighted_pred= get_bits1(&s->gb);
7196     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7197     pps->init_qp= get_se_golomb(&s->gb) + 26;
7198     pps->init_qs= get_se_golomb(&s->gb) + 26;
7199     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7200     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7201     pps->constrained_intra_pred= get_bits1(&s->gb);
7202     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7203
7204     pps->transform_8x8_mode= 0;
7205     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7206     memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
7207     memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
7208
7209     if(get_bits_count(&s->gb) < bit_length){
7210         pps->transform_8x8_mode= get_bits1(&s->gb);
7211         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7212         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7213     } else {
7214         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7215     }
7216
7217     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7218     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7219     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7220         h->pps.chroma_qp_diff= 1;
7221
7222     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7223         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7224                pps_id, pps->sps_id,
7225                pps->cabac ? "CABAC" : "CAVLC",
7226                pps->slice_group_count,
7227                pps->ref_count[0], pps->ref_count[1],
7228                pps->weighted_pred ? "weighted" : "",
7229                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7230                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7231                pps->constrained_intra_pred ? "CONSTR" : "",
7232                pps->redundant_pic_cnt_present ? "REDU" : "",
7233                pps->transform_8x8_mode ? "8x8DCT" : ""
7234                );
7235     }
7236
7237     return 0;
7238 }
7239
7240 /**
7241  * Call decode_slice() for each context.
7242  *
7243  * @param h h264 master context
7244  * @param context_count number of contexts to execute
7245  */
7246 static void execute_decode_slices(H264Context *h, int context_count){
7247     MpegEncContext * const s = &h->s;
7248     AVCodecContext * const avctx= s->avctx;
7249     H264Context *hx;
7250     int i;
7251
7252     if(context_count == 1) {
7253         decode_slice(avctx, h);
7254     } else {
7255         for(i = 1; i < context_count; i++) {
7256             hx = h->thread_context[i];
7257             hx->s.error_resilience = avctx->error_resilience;
7258             hx->s.error_count = 0;
7259         }
7260
7261         avctx->execute(avctx, (void *)decode_slice,
7262                        (void **)h->thread_context, NULL, context_count);
7263
7264         /* pull back stuff from slices to master context */
7265         hx = h->thread_context[context_count - 1];
7266         s->mb_x = hx->s.mb_x;
7267         s->mb_y = hx->s.mb_y;
7268         s->dropable = hx->s.dropable;
7269         s->picture_structure = hx->s.picture_structure;
7270         for(i = 1; i < context_count; i++)
7271             h->s.error_count += h->thread_context[i]->s.error_count;
7272     }
7273 }
7274
7275
7276 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7277     MpegEncContext * const s = &h->s;
7278     AVCodecContext * const avctx= s->avctx;
7279     int buf_index=0;
7280     H264Context *hx; ///< thread context
7281     int context_count = 0;
7282
7283     h->max_contexts = avctx->thread_count;
7284 #if 0
7285     int i;
7286     for(i=0; i<50; i++){
7287         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7288     }
7289 #endif
7290     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7291         h->current_slice = 0;
7292         if (!s->first_field)
7293             s->current_picture_ptr= NULL;
7294     }
7295
7296     for(;;){
7297         int consumed;
7298         int dst_length;
7299         int bit_length;
7300         const uint8_t *ptr;
7301         int i, nalsize = 0;
7302         int err;
7303
7304         if(h->is_avc) {
7305             if(buf_index >= buf_size) break;
7306             nalsize = 0;
7307             for(i = 0; i < h->nal_length_size; i++)
7308                 nalsize = (nalsize << 8) | buf[buf_index++];
7309             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7310                 if(nalsize == 1){
7311                     buf_index++;
7312                     continue;
7313                 }else{
7314                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7315                     break;
7316                 }
7317             }
7318         } else {
7319             // start code prefix search
7320             for(; buf_index + 3 < buf_size; buf_index++){
7321                 // This should always succeed in the first iteration.
7322                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7323                     break;
7324             }
7325
7326             if(buf_index+3 >= buf_size) break;
7327
7328             buf_index+=3;
7329         }
7330
7331         hx = h->thread_context[context_count];
7332
7333         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7334         if (ptr==NULL || dst_length < 0){
7335             return -1;
7336         }
7337         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7338             dst_length--;
7339         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7340
7341         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7342             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7343         }
7344
7345         if (h->is_avc && (nalsize != consumed)){
7346             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7347             consumed= nalsize;
7348         }
7349
7350         buf_index += consumed;
7351
7352         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7353            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7354             continue;
7355
7356       again:
7357         err = 0;
7358         switch(hx->nal_unit_type){
7359         case NAL_IDR_SLICE:
7360             if (h->nal_unit_type != NAL_IDR_SLICE) {
7361                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7362                 return -1;
7363             }
7364             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7365         case NAL_SLICE:
7366             init_get_bits(&hx->s.gb, ptr, bit_length);
7367             hx->intra_gb_ptr=
7368             hx->inter_gb_ptr= &hx->s.gb;
7369             hx->s.data_partitioning = 0;
7370
7371             if((err = decode_slice_header(hx, h)))
7372                break;
7373
7374             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7375             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7376                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7377                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7378                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7379                && avctx->skip_frame < AVDISCARD_ALL)
7380                 context_count++;
7381             break;
7382         case NAL_DPA:
7383             init_get_bits(&hx->s.gb, ptr, bit_length);
7384             hx->intra_gb_ptr=
7385             hx->inter_gb_ptr= NULL;
7386             hx->s.data_partitioning = 1;
7387
7388             err = decode_slice_header(hx, h);
7389             break;
7390         case NAL_DPB:
7391             init_get_bits(&hx->intra_gb, ptr, bit_length);
7392             hx->intra_gb_ptr= &hx->intra_gb;
7393             break;
7394         case NAL_DPC:
7395             init_get_bits(&hx->inter_gb, ptr, bit_length);
7396             hx->inter_gb_ptr= &hx->inter_gb;
7397
7398             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7399                && s->context_initialized
7400                && s->hurry_up < 5
7401                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7402                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7403                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7404                && avctx->skip_frame < AVDISCARD_ALL)
7405                 context_count++;
7406             break;
7407         case NAL_SEI:
7408             init_get_bits(&s->gb, ptr, bit_length);
7409             decode_sei(h);
7410             break;
7411         case NAL_SPS:
7412             init_get_bits(&s->gb, ptr, bit_length);
7413             decode_seq_parameter_set(h);
7414
7415             if(s->flags& CODEC_FLAG_LOW_DELAY)
7416                 s->low_delay=1;
7417
7418             if(avctx->has_b_frames < 2)
7419                 avctx->has_b_frames= !s->low_delay;
7420             break;
7421         case NAL_PPS:
7422             init_get_bits(&s->gb, ptr, bit_length);
7423
7424             decode_picture_parameter_set(h, bit_length);
7425
7426             break;
7427         case NAL_AUD:
7428         case NAL_END_SEQUENCE:
7429         case NAL_END_STREAM:
7430         case NAL_FILLER_DATA:
7431         case NAL_SPS_EXT:
7432         case NAL_AUXILIARY_SLICE:
7433             break;
7434         default:
7435             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7436         }
7437
7438         if(context_count == h->max_contexts) {
7439             execute_decode_slices(h, context_count);
7440             context_count = 0;
7441         }
7442
7443         if (err < 0)
7444             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7445         else if(err == 1) {
7446             /* Slice could not be decoded in parallel mode, copy down
7447              * NAL unit stuff to context 0 and restart. Note that
7448              * rbsp_buffer is not transferred, but since we no longer
7449              * run in parallel mode this should not be an issue. */
7450             h->nal_unit_type = hx->nal_unit_type;
7451             h->nal_ref_idc   = hx->nal_ref_idc;
7452             hx = h;
7453             goto again;
7454         }
7455     }
7456     if(context_count)
7457         execute_decode_slices(h, context_count);
7458     return buf_index;
7459 }
7460
7461 /**
7462  * returns the number of bytes consumed for building the current frame
7463  */
7464 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7465         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7466         if(pos+10>buf_size) pos=buf_size; // oops ;)
7467
7468         return pos;
7469 }
7470
7471 static int decode_frame(AVCodecContext *avctx,
7472                              void *data, int *data_size,
7473                              const uint8_t *buf, int buf_size)
7474 {
7475     H264Context *h = avctx->priv_data;
7476     MpegEncContext *s = &h->s;
7477     AVFrame *pict = data;
7478     int buf_index;
7479
7480     s->flags= avctx->flags;
7481     s->flags2= avctx->flags2;
7482
7483    /* end of stream, output what is still in the buffers */
7484     if (buf_size == 0) {
7485         Picture *out;
7486         int i, out_idx;
7487
7488 //FIXME factorize this with the output code below
7489         out = h->delayed_pic[0];
7490         out_idx = 0;
7491         for(i=1; h->delayed_pic[i] && h->delayed_pic[i]->poc; i++)
7492             if(h->delayed_pic[i]->poc < out->poc){
7493                 out = h->delayed_pic[i];
7494                 out_idx = i;
7495             }
7496
7497         for(i=out_idx; h->delayed_pic[i]; i++)
7498             h->delayed_pic[i] = h->delayed_pic[i+1];
7499
7500         if(out){
7501             *data_size = sizeof(AVFrame);
7502             *pict= *(AVFrame*)out;
7503         }
7504
7505         return 0;
7506     }
7507
7508     if(h->is_avc && !h->got_avcC) {
7509         int i, cnt, nalsize;
7510         unsigned char *p = avctx->extradata;
7511         if(avctx->extradata_size < 7) {
7512             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7513             return -1;
7514         }
7515         if(*p != 1) {
7516             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7517             return -1;
7518         }
7519         /* sps and pps in the avcC always have length coded with 2 bytes,
7520            so put a fake nal_length_size = 2 while parsing them */
7521         h->nal_length_size = 2;
7522         // Decode sps from avcC
7523         cnt = *(p+5) & 0x1f; // Number of sps
7524         p += 6;
7525         for (i = 0; i < cnt; i++) {
7526             nalsize = AV_RB16(p) + 2;
7527             if(decode_nal_units(h, p, nalsize) < 0) {
7528                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7529                 return -1;
7530             }
7531             p += nalsize;
7532         }
7533         // Decode pps from avcC
7534         cnt = *(p++); // Number of pps
7535         for (i = 0; i < cnt; i++) {
7536             nalsize = AV_RB16(p) + 2;
7537             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7538                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7539                 return -1;
7540             }
7541             p += nalsize;
7542         }
7543         // Now store right nal length size, that will be use to parse all other nals
7544         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7545         // Do not reparse avcC
7546         h->got_avcC = 1;
7547     }
7548
7549     if(avctx->frame_number==0 && !h->is_avc && s->avctx->extradata_size){
7550         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7551             return -1;
7552     }
7553
7554     buf_index=decode_nal_units(h, buf, buf_size);
7555     if(buf_index < 0)
7556         return -1;
7557
7558     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7559         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7560         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7561         return -1;
7562     }
7563
7564     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7565         Picture *out = s->current_picture_ptr;
7566         Picture *cur = s->current_picture_ptr;
7567         int i, pics, cross_idr, out_of_order, out_idx;
7568
7569         s->mb_y= 0;
7570
7571         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7572         s->current_picture_ptr->pict_type= s->pict_type;
7573
7574         if(!s->dropable) {
7575             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7576             h->prev_poc_msb= h->poc_msb;
7577             h->prev_poc_lsb= h->poc_lsb;
7578         }
7579         h->prev_frame_num_offset= h->frame_num_offset;
7580         h->prev_frame_num= h->frame_num;
7581
7582         /*
7583          * FIXME: Error handling code does not seem to support interlaced
7584          * when slices span multiple rows
7585          * The ff_er_add_slice calls don't work right for bottom
7586          * fields; they cause massive erroneous error concealing
7587          * Error marking covers both fields (top and bottom).
7588          * This causes a mismatched s->error_count
7589          * and a bad error table. Further, the error count goes to
7590          * INT_MAX when called for bottom field, because mb_y is
7591          * past end by one (callers fault) and resync_mb_y != 0
7592          * causes problems for the first MB line, too.
7593          */
7594         if (!FIELD_PICTURE)
7595             ff_er_frame_end(s);
7596
7597         MPV_frame_end(s);
7598
7599         if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7600             /* Wait for second field. */
7601             *data_size = 0;
7602
7603         } else {
7604             cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7605             /* Derive top_field_first from field pocs. */
7606             cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7607
7608         //FIXME do something with unavailable reference frames
7609
7610             /* Sort B-frames into display order */
7611
7612             if(h->sps.bitstream_restriction_flag
7613                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7614                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7615                 s->low_delay = 0;
7616             }
7617
7618             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7619                && !h->sps.bitstream_restriction_flag){
7620                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7621                 s->low_delay= 0;
7622             }
7623
7624             pics = 0;
7625             while(h->delayed_pic[pics]) pics++;
7626
7627             assert(pics <= MAX_DELAYED_PIC_COUNT);
7628
7629             h->delayed_pic[pics++] = cur;
7630             if(cur->reference == 0)
7631                 cur->reference = DELAYED_PIC_REF;
7632
7633             out = h->delayed_pic[0];
7634             out_idx = 0;
7635             for(i=1; h->delayed_pic[i] && h->delayed_pic[i]->poc; i++)
7636                 if(h->delayed_pic[i]->poc < out->poc){
7637                     out = h->delayed_pic[i];
7638                     out_idx = i;
7639                 }
7640             cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i];
7641
7642             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7643
7644             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7645                 { }
7646             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7647                || (s->low_delay &&
7648                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7649                  || cur->pict_type == FF_B_TYPE)))
7650             {
7651                 s->low_delay = 0;
7652                 s->avctx->has_b_frames++;
7653             }
7654
7655             if(out_of_order || pics > s->avctx->has_b_frames){
7656                 out->reference &= ~DELAYED_PIC_REF;
7657                 for(i=out_idx; h->delayed_pic[i]; i++)
7658                     h->delayed_pic[i] = h->delayed_pic[i+1];
7659             }
7660             if(!out_of_order && pics > s->avctx->has_b_frames){
7661                 *data_size = sizeof(AVFrame);
7662
7663                 h->outputed_poc = out->poc;
7664                 *pict= *(AVFrame*)out;
7665             }else{
7666                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7667             }
7668         }
7669     }
7670
7671     assert(pict->data[0] || !*data_size);
7672     ff_print_debug_info(s, pict);
7673 //printf("out %d\n", (int)pict->data[0]);
7674 #if 0 //?
7675
7676     /* Return the Picture timestamp as the frame number */
7677     /* we subtract 1 because it is added on utils.c     */
7678     avctx->frame_number = s->picture_number - 1;
7679 #endif
7680     return get_consumed_bytes(s, buf_index, buf_size);
7681 }
7682 #if 0
7683 static inline void fill_mb_avail(H264Context *h){
7684     MpegEncContext * const s = &h->s;
7685     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
7686
7687     if(s->mb_y){
7688         h->mb_avail[0]= s->mb_x                 && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
7689         h->mb_avail[1]=                            h->slice_table[mb_xy - s->mb_stride    ] == h->slice_num;
7690         h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
7691     }else{
7692         h->mb_avail[0]=
7693         h->mb_avail[1]=
7694         h->mb_avail[2]= 0;
7695     }
7696     h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
7697     h->mb_avail[4]= 1; //FIXME move out
7698     h->mb_avail[5]= 0; //FIXME move out
7699 }
7700 #endif
7701
7702 #ifdef TEST
7703 #undef printf
7704 #undef random
7705 #define COUNT 8000
7706 #define SIZE (COUNT*40)
7707 int main(void){
7708     int i;
7709     uint8_t temp[SIZE];
7710     PutBitContext pb;
7711     GetBitContext gb;
7712 //    int int_temp[10000];
7713     DSPContext dsp;
7714     AVCodecContext avctx;
7715
7716     dsputil_init(&dsp, &avctx);
7717
7718     init_put_bits(&pb, temp, SIZE);
7719     printf("testing unsigned exp golomb\n");
7720     for(i=0; i<COUNT; i++){
7721         START_TIMER
7722         set_ue_golomb(&pb, i);
7723         STOP_TIMER("set_ue_golomb");
7724     }
7725     flush_put_bits(&pb);
7726
7727     init_get_bits(&gb, temp, 8*SIZE);
7728     for(i=0; i<COUNT; i++){
7729         int j, s;
7730
7731         s= show_bits(&gb, 24);
7732
7733         START_TIMER
7734         j= get_ue_golomb(&gb);
7735         if(j != i){
7736             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7737 //            return -1;
7738         }
7739         STOP_TIMER("get_ue_golomb");
7740     }
7741
7742
7743     init_put_bits(&pb, temp, SIZE);
7744     printf("testing signed exp golomb\n");
7745     for(i=0; i<COUNT; i++){
7746         START_TIMER
7747         set_se_golomb(&pb, i - COUNT/2);
7748         STOP_TIMER("set_se_golomb");
7749     }
7750     flush_put_bits(&pb);
7751
7752     init_get_bits(&gb, temp, 8*SIZE);
7753     for(i=0; i<COUNT; i++){
7754         int j, s;
7755
7756         s= show_bits(&gb, 24);
7757
7758         START_TIMER
7759         j= get_se_golomb(&gb);
7760         if(j != i - COUNT/2){
7761             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7762 //            return -1;
7763         }
7764         STOP_TIMER("get_se_golomb");
7765     }
7766
7767 #if 0
7768     printf("testing 4x4 (I)DCT\n");
7769
7770     DCTELEM block[16];
7771     uint8_t src[16], ref[16];
7772     uint64_t error= 0, max_error=0;
7773
7774     for(i=0; i<COUNT; i++){
7775         int j;
7776 //        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
7777         for(j=0; j<16; j++){
7778             ref[j]= random()%255;
7779             src[j]= random()%255;
7780         }
7781
7782         h264_diff_dct_c(block, src, ref, 4);
7783
7784         //normalize
7785         for(j=0; j<16; j++){
7786 //            printf("%d ", block[j]);
7787             block[j]= block[j]*4;
7788             if(j&1) block[j]= (block[j]*4 + 2)/5;
7789             if(j&4) block[j]= (block[j]*4 + 2)/5;
7790         }
7791 //        printf("\n");
7792
7793         s->dsp.h264_idct_add(ref, block, 4);
7794 /*        for(j=0; j<16; j++){
7795             printf("%d ", ref[j]);
7796         }
7797         printf("\n");*/
7798
7799         for(j=0; j<16; j++){
7800             int diff= FFABS(src[j] - ref[j]);
7801
7802             error+= diff*diff;
7803             max_error= FFMAX(max_error, diff);
7804         }
7805     }
7806     printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
7807     printf("testing quantizer\n");
7808     for(qp=0; qp<52; qp++){
7809         for(i=0; i<16; i++)
7810             src1_block[i]= src2_block[i]= random()%255;
7811
7812     }
7813     printf("Testing NAL layer\n");
7814
7815     uint8_t bitstream[COUNT];
7816     uint8_t nal[COUNT*2];
7817     H264Context h;
7818     memset(&h, 0, sizeof(H264Context));
7819
7820     for(i=0; i<COUNT; i++){
7821         int zeros= i;
7822         int nal_length;
7823         int consumed;
7824         int out_length;
7825         uint8_t *out;
7826         int j;
7827
7828         for(j=0; j<COUNT; j++){
7829             bitstream[j]= (random() % 255) + 1;
7830         }
7831
7832         for(j=0; j<zeros; j++){
7833             int pos= random() % COUNT;
7834             while(bitstream[pos] == 0){
7835                 pos++;
7836                 pos %= COUNT;
7837             }
7838             bitstream[pos]=0;
7839         }
7840
7841         START_TIMER
7842
7843         nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
7844         if(nal_length<0){
7845             printf("encoding failed\n");
7846             return -1;
7847         }
7848
7849         out= decode_nal(&h, nal, &out_length, &consumed, nal_length);
7850
7851         STOP_TIMER("NAL")
7852
7853         if(out_length != COUNT){
7854             printf("incorrect length %d %d\n", out_length, COUNT);
7855             return -1;
7856         }
7857
7858         if(consumed != nal_length){
7859             printf("incorrect consumed length %d %d\n", nal_length, consumed);
7860             return -1;
7861         }
7862
7863         if(memcmp(bitstream, out, COUNT)){
7864             printf("mismatch\n");
7865             return -1;
7866         }
7867     }
7868 #endif
7869
7870     printf("Testing RBSP\n");
7871
7872
7873     return 0;
7874 }
7875 #endif /* TEST */
7876
7877
7878 static av_cold int decode_end(AVCodecContext *avctx)
7879 {
7880     H264Context *h = avctx->priv_data;
7881     MpegEncContext *s = &h->s;
7882
7883     av_freep(&h->rbsp_buffer[0]);
7884     av_freep(&h->rbsp_buffer[1]);
7885     free_tables(h); //FIXME cleanup init stuff perhaps
7886     MPV_common_end(s);
7887
7888 //    memset(h, 0, sizeof(H264Context));
7889
7890     return 0;
7891 }
7892
7893
/**
 * Codec descriptor for the H.264 decoder.
 * The leading members are initialized positionally, so their meaning
 * depends on the member order of AVCodec (declared outside this file
 * section -- confirm against its definition before reordering anything).
 */
AVCodec h264_decoder = {
    "h264",
    CODEC_TYPE_VIDEO,
    CODEC_ID_H264,
    sizeof(H264Context),    /* size of the context that decode_frame/decode_end read from avctx->priv_data */
    decode_init,
    NULL,                   /* no function supplied for this slot */
    decode_end,
    decode_frame,
    /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
    .flush= flush_dpb,
    .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
};
7907
7908 #include "svq3.c"