git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35 #include "rectangle.h"
  36
  37 #include "cabac.h"
  38 #ifdef ARCH_X86
  39 #include "i386/h264_i386.h"
  40 #endif
  41
  42 //#undef NDEBUG
  43 #include <assert.h>
  44
  45 /**
  46  * Value of Picture.reference when Picture is not a reference picture, but
  47  * is held for delayed output.
  48  */
  49 #define DELAYED_PIC_REF 4
  50
     /* File-local VLC tables. NOTE(review): judging by their names these are the
      * coefficient-token, total-zeros and run-before tables; they are not
      * initialized in this part of the file -- confirm where they are filled. */
  51 static VLC coeff_token_vlc[4];
  52 static VLC chroma_dc_coeff_token_vlc;
  53
  54 static VLC total_zeros_vlc[15];
  55 static VLC chroma_dc_total_zeros_vlc[3];
  56
  57 static VLC run_vlc[6];
  58 static VLC run7_vlc;
  59
     /* Forward declarations of static functions used before their definitions;
      * the definitions are not visible in this part of the file. */
  60 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  61 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  62 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  63 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  64 static Picture * remove_long(H264Context *h, int i, int ref_mask);
  65
  66 static av_always_inline uint32_t pack16to32(int a, int b){
  67 #ifdef WORDS_BIGENDIAN
  68    return (b&0xFFFF) + (a<<16);
  69 #else
  70    return (a&0xFFFF) + (b<<16);
  71 #endif
  72 }
  73
  74 const uint8_t ff_rem6[52]={
  75 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
  76 };
  77
  78 const uint8_t ff_div6[52]={
  79 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
  80 };
  81
  82 static const int left_block_options[4][8]={
  83     {0,1,2,3,7,10,8,11},
  84     {2,2,3,3,8,11,8,11},
  85     {0,0,1,1,7,10,7,10},
  86     {0,2,0,2,7,10,7,10}
  87 };
  88
  89 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
     /*
      * Loads the neighbour caches in h for the current macroblock (h->mb_xy).
      *
      * The function locates the top-left, top, top-right and the two left
      * neighbour macroblocks (with extra adjustments when FRAME_MBAFF is set),
      * records the top/left positions in h->top_mb_xy and h->left_mb_xy[], and
      * then fills from those neighbours: the intra sample-availability masks,
      * the intra4x4 prediction-mode cache, the non-zero-count cache, the CABAC
      * top/left cbp values, and the mv / ref / mvd / direct caches.
      *
      * h           decoder context; its cache fields are overwritten
      * mb_type     type flags of the current macroblock
      * for_deblock nonzero when called for deblocking: neighbours are then
      *             accepted whenever their slice_table entry is < 255 instead
      *             of only when they belong to the current slice, and the
      *             top-left / top-right neighbours are treated as absent
      */
  90     MpegEncContext * const s = &h->s;
  91     const int mb_xy= h->mb_xy;
  92     int topleft_xy, top_xy, topright_xy, left_xy[2];
  93     int topleft_type, top_type, topright_type, left_type[2];
  94     int * left_block;
  95     int topleft_partition= -1;
  96     int i;
  97
  98     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
  99
 100     //FIXME deblocking could skip the intra and nnz parts.
         /* Deblocking shortcut: outside MBAFF, return without touching any cache
          * when slice_num is 1 or the top neighbour is in the same slice. */
 101     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
 102         return;
 103
 104     /* Wow, what a mess, why didn't they simplify the interlacing & intra
 105      * stuff, I can't imagine that these complex rules are worth it. */
 106
         /* Default neighbour positions and left-block index table; the MBAFF
          * branch below may override them. */
 107     topleft_xy = top_xy - 1;
 108     topright_xy= top_xy + 1;
 109     left_xy[1] = left_xy[0] = mb_xy-1;
 110     left_block = left_block_options[0];
 111     if(FRAME_MBAFF){
             /* pair_xy addresses the top macroblock of the current pair
              * (low bit of mb_y cleared); the *_frame_flag values are 1 when
              * the corresponding pair is not interlaced. */
 112         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 113         const int top_pair_xy      = pair_xy     - s->mb_stride;
 114         const int topleft_pair_xy  = top_pair_xy - 1;
 115         const int topright_pair_xy = top_pair_xy + 1;
 116         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 117         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 118         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 119         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 120         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 121         const int bottom = (s->mb_y & 1);
 122         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 123         if (bottom
 124                 ? !curr_mb_frame_flag // bottom macroblock
 125                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 126                 ) {
 127             top_xy -= s->mb_stride;
 128         }
 129         if (bottom
 130                 ? !curr_mb_frame_flag // bottom macroblock
 131                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 132                 ) {
 133             topleft_xy -= s->mb_stride;
 134         } else if(bottom && curr_mb_frame_flag && !left_mb_frame_flag) {
 135             topleft_xy += s->mb_stride;
 136             // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
 137             topleft_partition = 0;
 138         }
 139         if (bottom
 140                 ? !curr_mb_frame_flag // bottom macroblock
 141                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 142                 ) {
 143             topright_xy -= s->mb_stride;
 144         }
             /* Current and left pair differ in frame/field coding: address the
              * left pair from its top MB and select the matching row of
              * left_block_options. */
 145         if (left_mb_frame_flag != curr_mb_frame_flag) {
 146             left_xy[1] = left_xy[0] = pair_xy - 1;
 147             if (curr_mb_frame_flag) {
 148                 if (bottom) {
 149                     left_block = left_block_options[1];
 150                 } else {
 151                     left_block= left_block_options[2];
 152                 }
 153             } else {
 154                 left_xy[1] += s->mb_stride;
 155                 left_block = left_block_options[3];
 156             }
 157         }
 158     }
 159
         /* Publish the chosen top/left positions in the context. */
 160     h->top_mb_xy = top_xy;
 161     h->left_mb_xy[0] = left_xy[0];
 162     h->left_mb_xy[1] = left_xy[1];
         /* Determine neighbour types; a type of 0 marks a neighbour as unusable. */
 163     if(for_deblock){
 164         topleft_type = 0;
 165         topright_type = 0;
 166         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
 167         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
 168         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
 169
             /* MBAFF non-intra MB during deblocking: rebuild the luma nnz cache
              * from the 16 packed bits at non_zero_count[mb_xy][14] (stored by
              * write_back_non_zero_count) and reload this MB's own mv/ref
              * entries from the current picture. */
 170         if(FRAME_MBAFF && !IS_INTRA(mb_type)){
 171             int list;
 172             int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
 173             for(i=0; i<16; i++)
 174                 h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
 175             for(list=0; list<h->list_count; list++){
 176                 if(USES_LIST(mb_type,list)){
 177                     uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
 178                     uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
 179                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 180                     for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
 181                         dst[0] = src[0];
 182                         dst[1] = src[1];
 183                         dst[2] = src[2];
 184                         dst[3] = src[3];
 185                     }
 186                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 187                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
 188                     ref += h->b8_stride;
 189                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 190                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
 191                 }else{
 192                     fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
 193                     fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
 194                 }
 195             }
 196         }
 197     }else{
 198         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 199         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 200         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 201         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 202         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 203     }
 204
         /* Intra MBs: start from "everything available" masks and clear bits
          * for each neighbour that is missing, or that is non-intra while
          * constrained_intra_pred is set. NOTE(review): the masks look like one
          * bit per 4x4 block -- confirm the exact bit layout. */
 205     if(IS_INTRA(mb_type)){
 206         h->topleft_samples_available=
 207         h->top_samples_available=
 208         h->left_samples_available= 0xFFFF;
 209         h->topright_samples_available= 0xEEEA;
 210
 211         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
 212             h->topleft_samples_available= 0xB3FF;
 213             h->top_samples_available= 0x33FF;
 214             h->topright_samples_available= 0x26EA;
 215         }
 216         for(i=0; i<2; i++){
 217             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
 218                 h->topleft_samples_available&= 0xDF5F;
 219                 h->left_samples_available&= 0x5F5F;
 220             }
 221         }
 222
 223         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
 224             h->topleft_samples_available&= 0x7FFF;
 225
 226         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
 227             h->topright_samples_available&= 0xFBFF;
 228
             /* Intra4x4: load the neighbours' prediction modes into the row
              * above / column left of the current MB in the cache. Non-intra4x4
              * neighbours contribute -1 (missing, or inter with constrained
              * intra prediction) or 2 otherwise. */
 229         if(IS_INTRA4x4(mb_type)){
 230             if(IS_INTRA4x4(top_type)){
 231                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 232                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 233                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 234                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 235             }else{
 236                 int pred;
 237                 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
 238                     pred= -1;
 239                 else{
 240                     pred= 2;
 241                 }
 242                 h->intra4x4_pred_mode_cache[4+8*0]=
 243                 h->intra4x4_pred_mode_cache[5+8*0]=
 244                 h->intra4x4_pred_mode_cache[6+8*0]=
 245                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 246             }
 247             for(i=0; i<2; i++){
 248                 if(IS_INTRA4x4(left_type[i])){
 249                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 250                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 251                 }else{
 252                     int pred;
 253                     if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
 254                         pred= -1;
 255                     else{
 256                         pred= 2;
 257                     }
 258                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 259                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 260                 }
 261             }
 262         }
 263     }
 264
 265
 266 /*
 267 0 . T T. T T T T
 268 1 L . .L . . . .
 269 2 L . .L . . . .
 270 3 . T TL . . . .
 271 4 L . .L . . . .
 272 5 L . .. . . . .
 273 */
 274 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
         /* Non-zero counts from the top neighbour; when it is absent the
          * entries are set to 0 for CABAC non-intra MBs and to 64 otherwise. */
 275     if(top_type){
 276         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 277         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 278         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 279         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 280
 281         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 282         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 283
 284         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 285         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 286
 287     }else{
 288         h->non_zero_count_cache[4+8*0]=
 289         h->non_zero_count_cache[5+8*0]=
 290         h->non_zero_count_cache[6+8*0]=
 291         h->non_zero_count_cache[7+8*0]=
 292
 293         h->non_zero_count_cache[1+8*0]=
 294         h->non_zero_count_cache[2+8*0]=
 295
 296         h->non_zero_count_cache[1+8*3]=
 297         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 298
 299     }
 300
         /* Same for the two left neighbours, using left_block to select the
          * source entries. */
 301     for (i=0; i<2; i++) {
 302         if(left_type[i]){
 303             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 304             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 305             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 306             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 307         }else{
 308             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 309             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 310             h->non_zero_count_cache[0+8*1 +   8*i]=
 311             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 312         }
 313     }
 314
         /* CABAC only: neighbour coded-block-pattern values; missing neighbours
          * yield 0x1C0 for intra MBs and 0 otherwise. */
 315     if( h->pps.cabac ) {
 316         // top_cbp
 317         if(top_type) {
 318             h->top_cbp = h->cbp_table[top_xy];
 319         } else if(IS_INTRA(mb_type)) {
 320             h->top_cbp = 0x1C0;
 321         } else {
 322             h->top_cbp = 0;
 323         }
 324         // left_cbp
 325         if (left_type[0]) {
 326             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 327         } else if(IS_INTRA(mb_type)) {
 328             h->left_cbp = 0x1C0;
 329         } else {
 330             h->left_cbp = 0;
 331         }
 332         if (left_type[0]) {
 333             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 334         }
 335         if (left_type[1]) {
 336             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 337         }
 338     }
 339
 340 #if 1
         /* Inter / direct MBs: fill the border of the mv and ref caches (and,
          * for CABAC, the mvd and direct caches) from the neighbours, per list. */
 341     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 342         int list;
 343         for(list=0; list<h->list_count; list++){
 344             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 345                 /*if(!h->mv_cache_clean[list]){
 346                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 347                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 348                     h->mv_cache_clean[list]= 1;
 349                 }*/
 350                 continue;
 351             }
 352             h->mv_cache_clean[list]= 0;
 353
                 /* Top neighbour: its bottom row of motion vectors and refs. */
 354             if(USES_LIST(top_type, list)){
 355                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 356                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 357                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 358                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 359                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 360                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 361                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 362                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 363                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 364                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 365             }else{
 366                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 367                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 368                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 369                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 370                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 371             }
 372
                 /* Left neighbours: their rightmost column, rows chosen via left_block. */
 373             for(i=0; i<2; i++){
 374                 int cache_idx = scan8[0] - 1 + i*2*8;
 375                 if(USES_LIST(left_type[i], list)){
 376                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 377                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 378                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 379                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 380                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 381                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 382                 }else{
 383                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 384                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 385                     h->ref_cache[list][cache_idx  ]=
 386                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 387                 }
 388             }
 389
                 /* Outside MBAFF, the remaining entries are skipped for deblocking
                  * and for direct MBs without spatial direct prediction. */
 390             if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
 391                 continue;
 392
 393             if(USES_LIST(topleft_type, list)){
 394                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
 395                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
 396                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 397                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 398             }else{
 399                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 400                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 401             }
 402
 403             if(USES_LIST(topright_type, list)){
 404                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 405                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 406                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 407                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 408             }else{
 409                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 410                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 411             }
 412
 413             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 414                 continue;
 415
                 /* Mark cache positions to the right of / inside the MB that have
                  * no neighbour data as not available, with zero motion vectors. */
 416             h->ref_cache[list][scan8[5 ]+1] =
 417             h->ref_cache[list][scan8[7 ]+1] =
 418             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 419             h->ref_cache[list][scan8[4 ]] =
 420             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 421             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 422             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 423             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 424             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 425             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 426
 427             if( h->pps.cabac ) {
 428                 /* XXX beurk, Load mvd */
 429                 if(USES_LIST(top_type, list)){
 430                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 431                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 432                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 433                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 434                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 435                 }else{
 436                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 437                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 438                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 439                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 440                 }
 441                 if(USES_LIST(left_type[0], list)){
 442                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 443                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 444                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 445                 }else{
 446                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 447                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 448                 }
 449                 if(USES_LIST(left_type[1], list)){
 450                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 451                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 452                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 453                 }else{
 454                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 455                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 456                 }
 457                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 458                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 459                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 460                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 461                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 462
                     /* B slices: clear the direct flags of the current MB and
                      * load those of the top and left neighbours. */
 463                 if(h->slice_type_nos == FF_B_TYPE){
 464                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 465
 466                     if(IS_DIRECT(top_type)){
 467                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 468                     }else if(IS_8X8(top_type)){
 469                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 470                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 471                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 472                     }else{
 473                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 474                     }
 475
 476                     if(IS_DIRECT(left_type[0]))
 477                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 478                     else if(IS_8X8(left_type[0]))
 479                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 480                     else
 481                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 482
 483                     if(IS_DIRECT(left_type[1]))
 484                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 485                     else if(IS_8X8(left_type[1]))
 486                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 487                     else
 488                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 489                 }
 490             }
 491
                 /* MBAFF: convert neighbour entries whose frame/field coding differs
                  * from the current MB. For a field MB, frame neighbours get the
                  * ref index doubled and vertical mv/mvd halved; for a frame MB,
                  * field neighbours get the ref index halved and vertical mv/mvd
                  * doubled. Entries with a negative ref are left alone. */
 492             if(FRAME_MBAFF){
 493 #define MAP_MVS\
 494                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 495                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 496                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 497                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 498                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 499                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 500                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 501                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 502                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 503                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 504                 if(MB_FIELD){
 505 #define MAP_F2F(idx, mb_type)\
 506                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 507                         h->ref_cache[list][idx] <<= 1;\
 508                         h->mv_cache[list][idx][1] /= 2;\
 509                         h->mvd_cache[list][idx][1] /= 2;\
 510                     }
 511                     MAP_MVS
 512 #undef MAP_F2F
 513                 }else{
 514 #define MAP_F2F(idx, mb_type)\
 515                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 516                         h->ref_cache[list][idx] >>= 1;\
 517                         h->mv_cache[list][idx][1] <<= 1;\
 518                         h->mvd_cache[list][idx][1] <<= 1;\
 519                     }
 520                     MAP_MVS
 521 #undef MAP_F2F
 522                 }
 523             }
 524         }
 525     }
 526 #endif
 527
         /* Number (0..2) of top / first-left neighbours that use the 8x8 transform. */
 528     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 529 }
 530
 531 static inline void write_back_intra_pred_mode(H264Context *h){
 532     const int mb_xy= h->mb_xy;
 533
 534     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 535     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 536     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 537     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 538     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 539     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 540     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 541 }
 542
 543 /**
 544  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 545  */
 546 static inline int check_intra4x4_pred_mode(H264Context *h){
 547     MpegEncContext * const s = &h->s;
 548     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 549     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 550     int i;
 551
 552     if(!(h->top_samples_available&0x8000)){
 553         for(i=0; i<4; i++){
 554             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 555             if(status<0){
 556                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 557                 return -1;
 558             } else if(status){
 559                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 560             }
 561         }
 562     }
 563
 564     if(!(h->left_samples_available&0x8000)){
 565         for(i=0; i<4; i++){
 566             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 567             if(status<0){
 568                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 569                 return -1;
 570             } else if(status){
 571                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 572             }
 573         }
 574     }
 575
 576     return 0;
 577 } //FIXME cleanup like next
 578
 579 /**
 580  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 581  */
 582 static inline int check_intra_pred_mode(H264Context *h, int mode){
 583     MpegEncContext * const s = &h->s;
 584     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 585     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 586
 587     if(mode > 6U) {
 588         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 589         return -1;
 590     }
 591
 592     if(!(h->top_samples_available&0x8000)){
 593         mode= top[ mode ];
 594         if(mode<0){
 595             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 596             return -1;
 597         }
 598     }
 599
 600     if(!(h->left_samples_available&0x8000)){
 601         mode= left[ mode ];
 602         if(mode<0){
 603             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 604             return -1;
 605         }
 606     }
 607
 608     return mode;
 609 }
 610
 611 /**
 612  * gets the predicted intra4x4 prediction mode.
 613  */
 614 static inline int pred_intra_mode(H264Context *h, int n){
 615     const int index8= scan8[n];
 616     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 617     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 618     const int min= FFMIN(left, top);
 619
 620     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 621
 622     if(min<0) return DC_PRED;
 623     else      return min;
 624 }
 625
 626 static inline void write_back_non_zero_count(H264Context *h){
 627     const int mb_xy= h->mb_xy;
 628
 629     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 630     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 631     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 632     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 633     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 634     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 635     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 636
 637     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 638     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 639     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 640
 641     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 642     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 643     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 644
 645     if(FRAME_MBAFF){
 646         // store all luma nnzs, for deblocking
 647         int v = 0, i;
 648         for(i=0; i<16; i++)
 649             v += (!!h->non_zero_count_cache[scan8[i]]) << i;
 650         *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
 651     }
 652 }
 653
 654 /**
 655  * gets the predicted number of non-zero coefficients.
 656  * @param n block index
 657  */
 658 static inline int pred_non_zero_count(H264Context *h, int n){
 659     const int index8= scan8[n];
 660     const int left= h->non_zero_count_cache[index8 - 1];
 661     const int top = h->non_zero_count_cache[index8 - 8];
 662     int i= left + top;
 663
 664     if(i<64) i= (i+1)>>1;
 665
 666     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 667
 668     return i&31;
 669 }
 670
 /**
  * Fetches the diagonal neighbour (candidate "C") used for MV prediction.
  *
  * Normally this is the top-right entry of the caches
  * (position i - 8 + part_width); if its reference is PART_NOT_AVAILABLE
  * the top-left entry (position i - 8 - 1) is used instead.
  *
  * In MBAFF frames a few neighbour configurations are handled separately
  * through the SET_DIAG_MV macro: the MV and reference are read from the
  * current picture's motion_val / ref_index tables, the vertical MV
  * component and the reference index are rescaled with the given operators,
  * the MV is stored in the scratch entry mv_cache[list][scan8[0]-2] and the
  * function returns from inside the macro (LIST_NOT_USED if the
  * neighbouring macroblock does not use the list).
  *
  * @param C          receives a pointer to the selected MV
  * @param i          position in the caches (callers pass scan8[n])
  * @param list       reference list
  * @param part_width width of the partition in cache columns
  * @return the reference index belonging to *C
  */
 671 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 672     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 673     MpegEncContext *s = &h->s;
 674
 675     /* there is no consistent mapping of mvs to neighboring locations that will
 676      * make mbaff happy, so we can't move all this logic to fill_caches */
 677     if(FRAME_MBAFF){
 678         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 679         const int16_t *mv;
         /* clear the scratch entry and point *C at it; SET_DIAG_MV fills it in */
 680         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 681         *C = h->mv_cache[list][scan8[0]-2];
 682
         /* frame MB in the bottom row of a pair whose top(-right) neighbour is interlaced */
 683         if(!MB_FIELD
 684            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 685             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 686             if(IS_INTERLACED(mb_types[topright_xy])){
 687 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 688                 const int x4 = X4, y4 = Y4;\
 689                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 690                 if(!USES_LIST(mb_type,list))\
 691                     return LIST_NOT_USED;\
 692                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 693                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 694                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 695                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 696
 697                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 698             }
 699         }
         /* top-right unavailable, partition in the leftmost column, left neighbour
          * available and coded with the other frame/field mode than this MB */
 700         if(topright_ref == PART_NOT_AVAILABLE
 701            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 702            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 703             if(!MB_FIELD
 704                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 705                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 706             }
 707             if(MB_FIELD
 708                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 709                && i >= scan8[0]+8){
 710                 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
 711                 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 712             }
 713         }
 714 #undef SET_DIAG_MV
 715     }
 716
     /* regular case: top-right neighbour, else fall back to the top-left one */
 717     if(topright_ref != PART_NOT_AVAILABLE){
 718         *C= h->mv_cache[list][ i - 8 + part_width ];
 719         return topright_ref;
 720     }else{
 721         tprintf(s->avctx, "topright MV not available\n");
 722
 723         *C= h->mv_cache[list][ i - 8 - 1 ];
 724         return h->ref_cache[list][ i - 8 - 1 ];
 725     }
 726 }
 727
 728 /**
 729  * gets the predicted MV.
 730  * @param n the block index
 731  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 732  * @param mx the x component of the predicted motion vector
 733  * @param my the y component of the predicted motion vector
 734  */
 735 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 736     const int index8= scan8[n];
 737     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 738     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 739     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 740     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 741     const int16_t * C;
 742     int diagonal_ref, match_count;
 743
 744     assert(part_width==1 || part_width==2 || part_width==4);
 745
 746 /* mv_cache
 747   B . . A T T T T
 748   U . . L . . , .
 749   U . . L . . . .
 750   U . . L . . , .
 751   . . . L . . . .
 752 */
 753
 754     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 755     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 756     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 757     if(match_count > 1){ //most common
 758         *mx= mid_pred(A[0], B[0], C[0]);
 759         *my= mid_pred(A[1], B[1], C[1]);
 760     }else if(match_count==1){
 761         if(left_ref==ref){
 762             *mx= A[0];
 763             *my= A[1];
 764         }else if(top_ref==ref){
 765             *mx= B[0];
 766             *my= B[1];
 767         }else{
 768             *mx= C[0];
 769             *my= C[1];
 770         }
 771     }else{
 772         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 773             *mx= A[0];
 774             *my= A[1];
 775         }else{
 776             *mx= mid_pred(A[0], B[0], C[0]);
 777             *my= mid_pred(A[1], B[1], C[1]);
 778         }
 779     }
 780
 781     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 782 }
 783
 784 /**
 785  * gets the directionally predicted 16x8 MV.
 786  * @param n the block index
 787  * @param mx the x component of the predicted motion vector
 788  * @param my the y component of the predicted motion vector
 789  */
 790 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 791     if(n==0){
 792         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 793         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 794
 795         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 796
 797         if(top_ref == ref){
 798             *mx= B[0];
 799             *my= B[1];
 800             return;
 801         }
 802     }else{
 803         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 804         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 805
 806         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 807
 808         if(left_ref == ref){
 809             *mx= A[0];
 810             *my= A[1];
 811             return;
 812         }
 813     }
 814
 815     //RARE
 816     pred_motion(h, n, 4, list, ref, mx, my);
 817 }
 818
 819 /**
 820  * gets the directionally predicted 8x16 MV.
 821  * @param n the block index
 822  * @param mx the x component of the predicted motion vector
 823  * @param my the y component of the predicted motion vector
 824  */
 825 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 826     if(n==0){
 827         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 828         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 829
 830         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 831
 832         if(left_ref == ref){
 833             *mx= A[0];
 834             *my= A[1];
 835             return;
 836         }
 837     }else{
 838         const int16_t * C;
 839         int diagonal_ref;
 840
 841         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 842
 843         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 844
 845         if(diagonal_ref == ref){
 846             *mx= C[0];
 847             *my= C[1];
 848             return;
 849         }
 850     }
 851
 852     //RARE
 853     pred_motion(h, n, 2, list, ref, mx, my);
 854 }
 855
 856 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 857     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 858     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 859
 860     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 861
 862     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 863        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 864        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 865
 866         *mx = *my = 0;
 867         return;
 868     }
 869
 870     pred_motion(h, 0, 4, 0, 0, mx, my);
 871
 872     return;
 873 }
 874
 875 static inline void direct_dist_scale_factor(H264Context * const h){
 876     MpegEncContext * const s = &h->s;
 877     const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
 878     const int poc1 = h->ref_list[1][0].poc;
 879     int i;
 880     for(i=0; i<h->ref_count[0]; i++){
 881         int poc0 = h->ref_list[0][i].poc;
 882         int td = av_clip(poc1 - poc0, -128, 127);
 883         if(td == 0 /* FIXME || pic0 is a long-term ref */){
 884             h->dist_scale_factor[i] = 256;
 885         }else{
 886             int tb = av_clip(poc - poc0, -128, 127);
 887             int tx = (16384 + (FFABS(td) >> 1)) / td;
 888             h->dist_scale_factor[i] = av_clip((tb*tx + 32) >> 6, -1024, 1023);
 889         }
 890     }
 891     if(FRAME_MBAFF){
 892         for(i=0; i<h->ref_count[0]; i++){
 893             h->dist_scale_factor_field[2*i] =
 894             h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
 895         }
 896     }
 897 }
 898 static inline void direct_ref_list_init(H264Context * const h){
 899     MpegEncContext * const s = &h->s;
 900     Picture * const ref1 = &h->ref_list[1][0];
 901     Picture * const cur = s->current_picture_ptr;
 902     int list, i, j;
 903     int sidx= s->picture_structure&1;
 904     if(cur->pict_type == FF_I_TYPE)
 905         cur->ref_count[sidx][0] = 0;
 906     if(cur->pict_type != FF_B_TYPE)
 907         cur->ref_count[sidx][1] = 0;
 908     for(list=0; list<2; list++){
 909         cur->ref_count[sidx][list] = h->ref_count[list];
 910         for(j=0; j<h->ref_count[list]; j++)
 911             cur->ref_poc[sidx][list][j] = h->ref_list[list][j].poc;
 912     }
 913     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 914         return;
 915     for(list=0; list<2; list++){
 916         for(i=0; i<ref1->ref_count[sidx][list]; i++){
 917             const int poc = ref1->ref_poc[sidx][list][i];
 918             h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
 919             for(j=0; j<h->ref_count[list]; j++)
 920                 if(h->ref_list[list][j].poc == poc){
 921                     h->map_col_to_list0[list][i] = j;
 922                     break;
 923                 }
 924         }
 925     }
 926     if(FRAME_MBAFF){
 927         for(list=0; list<2; list++){
 928             for(i=0; i<ref1->ref_count[sidx][list]; i++){
 929                 j = h->map_col_to_list0[list][i];
 930                 h->map_col_to_list0_field[list][2*i] = 2*j;
 931                 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
 932             }
 933         }
 934     }
 935 }
 936
 /**
  * Fills h->ref_cache and h->mv_cache of the current macroblock for direct
  * prediction, using the co-located data of the first list 1 picture
  * (h->ref_list[1][0]).
  *
  * If h->direct_spatial_mv_pred is set, the references are the smallest
  * non-negative references of the neighbours and the MVs come from
  * pred_motion(); the MV of a list whose reference is 0 is zeroed where the
  * co-located block has reference 0 and an MV within +-1 in both components.
  * Otherwise (temporal direct) the co-located MVs are scaled with
  * dist_scale_factor, the co-located reference is translated through
  * map_col_to_list0 and the list 1 MV is the list 0 MV minus the
  * co-located MV.
  *
  * When the macroblock is of 8x8 type on entry, only the sub-blocks whose
  * h->sub_mb_type[] is direct are processed.
  *
  * @param mb_type in: type of the current macroblock,
  *                out: type updated with the partitioning, the lists used
  *                and the direct / interlaced flags selected here
  */
 937 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 938     MpegEncContext * const s = &h->s;
 939     const int mb_xy =   h->mb_xy;
 940     const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
 941     const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
 942     const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
 943     const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
 944     const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
 945     const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
 946     const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
 947     const int is_b8x8 = IS_8X8(*mb_type);
 948     unsigned int sub_mb_type;
 949     int i8, i4;
 950
     /* select the partitioning from the type of the co-located macroblock */
 951 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
 952     if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
 953         /* FIXME save sub mb types from previous frames (or derive from MVs)
 954          * so we know exactly what block size to use */
 955         sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
 956         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
 957     }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
 958         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
 959         *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
 960     }else{
 961         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
 962         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
 963     }
 964     if(!is_b8x8)
 965         *mb_type |= MB_TYPE_DIRECT2;
 966     if(MB_FIELD)
 967         *mb_type |= MB_TYPE_INTERLACED;
 968
 969     tprintf(s->avctx, "mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);
 970
 971     if(h->direct_spatial_mv_pred){
 972         int ref[2];
 973         int mv[2][2];
 974         int list;
 975
 976         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
 977
 978         /* ref = min(neighbors) */
 979         for(list=0; list<2; list++){
 980             int refa = h->ref_cache[list][scan8[0] - 1];
 981             int refb = h->ref_cache[list][scan8[0] - 8];
 982             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
 983             if(refc == PART_NOT_AVAILABLE)
 984                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
                 /* unsigned minimum: negative references compare as huge values, so the
                  * smallest non-negative one wins; the result is negative only if all
                  * three are negative */
 985             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
 986             if(ref[list] < 0)
 987                 ref[list] = -1;
 988         }
 989
             /* no usable reference in either list: use reference 0 and zero MVs for both */
 990         if(ref[0] < 0 && ref[1] < 0){
 991             ref[0] = ref[1] = 0;
 992             mv[0][0] = mv[0][1] =
 993             mv[1][0] = mv[1][1] = 0;
 994         }else{
 995             for(list=0; list<2; list++){
 996                 if(ref[list] >= 0)
 997                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
 998                 else
 999                     mv[list][0] = mv[list][1] = 0;
1000             }
1001         }
1002
             /* drop the list which has no reference from the (sub) macroblock type */
1003         if(ref[1] < 0){
1004             if(!is_b8x8)
1005                 *mb_type &= ~MB_TYPE_L1;
1006             sub_mb_type &= ~MB_TYPE_L1;
1007         }else if(ref[0] < 0){
1008             if(!is_b8x8)
1009                 *mb_type &= ~MB_TYPE_L0;
1010             sub_mb_type &= ~MB_TYPE_L0;
1011         }
1012
             /* the current and the co-located macroblock differ in frame/field coding:
              * work per 8x8 block on adjusted co-located pointers and strides */
1013         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1014             int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1015             int mb_types_col[2];
1016             int b8_stride = h->b8_stride;
1017             int b4_stride = h->b_stride;
1018
1019             *mb_type = (*mb_type & ~MB_TYPE_16x16) | MB_TYPE_8x8;
1020
1021             if(IS_INTERLACED(*mb_type)){
1022                 mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1023                 mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1024                 if(s->mb_y&1){
1025                     l1ref0 -= 2*b8_stride;
1026                     l1ref1 -= 2*b8_stride;
1027                     l1mv0 -= 4*b4_stride;
1028                     l1mv1 -= 4*b4_stride;
1029                 }
1030                 b8_stride *= 3;
1031                 b4_stride *= 6;
1032             }else{
1033                 int cur_poc = s->current_picture_ptr->poc;
1034                 int *col_poc = h->ref_list[1]->field_poc;
                     /* 1 if the second field_poc entry is at least as close to the current POC as the first */
1035                 int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1036                 int dy = 2*col_parity - (s->mb_y&1);
1037                 mb_types_col[0] =
1038                 mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy + col_parity*s->mb_stride];
1039                 l1ref0 += dy*b8_stride;
1040                 l1ref1 += dy*b8_stride;
1041                 l1mv0 += 2*dy*b4_stride;
1042                 l1mv1 += 2*dy*b4_stride;
1043                 b8_stride = 0;
1044             }
1045
1046             for(i8=0; i8<4; i8++){
1047                 int x8 = i8&1;
1048                 int y8 = i8>>1;
1049                 int xy8 = x8+y8*b8_stride;
1050                 int xy4 = 3*x8+y8*b4_stride;
1051                 int a=0, b=0;
1052
1053                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1054                     continue;
1055                 h->sub_mb_type[i8] = sub_mb_type;
1056
1057                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1058                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
                     /* co-located block is (nearly) static: keep the predicted MV only for
                      * references > 0, otherwise leave the MV at zero */
1059                 if(!IS_INTRA(mb_types_col[y8])
1060                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1061                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1062                     if(ref[0] > 0)
1063                         a= pack16to32(mv[0][0],mv[0][1]);
1064                     if(ref[1] > 0)
1065                         b= pack16to32(mv[1][0],mv[1][1]);
1066                 }else{
1067                     a= pack16to32(mv[0][0],mv[0][1]);
1068                     b= pack16to32(mv[1][0],mv[1][1]);
1069                 }
1070                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1071                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1072             }
1073         }else if(IS_16X16(*mb_type)){
                 /* whole macroblock at once, same static-block test as above */
1074             int a=0, b=0;
1075
1076             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1077             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1078             if(!IS_INTRA(mb_type_col)
1079                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1080                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1081                        && (h->x264_build>33 || !h->x264_build)))){
1082                 if(ref[0] > 0)
1083                     a= pack16to32(mv[0][0],mv[0][1]);
1084                 if(ref[1] > 0)
1085                     b= pack16to32(mv[1][0],mv[1][1]);
1086             }else{
1087                 a= pack16to32(mv[0][0],mv[0][1]);
1088                 b= pack16to32(mv[1][0],mv[1][1]);
1089             }
1090             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1091             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1092         }else{
                 /* per 8x8 block: write the predicted MVs first, then zero them where
                  * the co-located block passes the static test */
1093             for(i8=0; i8<4; i8++){
1094                 const int x8 = i8&1;
1095                 const int y8 = i8>>1;
1096
1097                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1098                     continue;
1099                 h->sub_mb_type[i8] = sub_mb_type;
1100
1101                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1102                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1103                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1104                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1105
1106                 /* col_zero_flag */
1107                 if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
1108                                               || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
1109                                                   && (h->x264_build>33 || !h->x264_build)))){
1110                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
1111                     if(IS_SUB_8X8(sub_mb_type)){
1112                         const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1113                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1114                             if(ref[0] == 0)
1115                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1116                             if(ref[1] == 0)
1117                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1118                         }
1119                     }else
1120                     for(i4=0; i4<4; i4++){
1121                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1122                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1123                             if(ref[0] == 0)
1124                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1125                             if(ref[1] == 0)
1126                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1127                         }
1128                     }
1129                 }
1130             }
1131         }
1132     }else{ /* direct temporal mv pred */
1133         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1134         const int *dist_scale_factor = h->dist_scale_factor;
1135
1136         if(FRAME_MBAFF){
                 /* field macroblocks use the per-field variants of the tables */
1137             if(IS_INTERLACED(*mb_type)){
1138                 map_col_to_list0[0] = h->map_col_to_list0_field[0];
1139                 map_col_to_list0[1] = h->map_col_to_list0_field[1];
1140                 dist_scale_factor = h->dist_scale_factor_field;
1141             }
1142             if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1143                 /* FIXME assumes direct_8x8_inference == 1 */
1144                 const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1145                 int mb_types_col[2];
1146                 int y_shift;
1147
1148                 *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
1149                          | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
1150                          | (*mb_type & MB_TYPE_INTERLACED);
1151                 sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;
1152
1153                 if(IS_INTERLACED(*mb_type)){
1154                     /* frame to field scaling */
1155                     mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1156                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1157                     if(s->mb_y&1){
1158                         l1ref0 -= 2*h->b8_stride;
1159                         l1ref1 -= 2*h->b8_stride;
1160                         l1mv0 -= 4*h->b_stride;
1161                         l1mv1 -= 4*h->b_stride;
1162                     }
1163                     y_shift = 0;
1164
1165                     if(   (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
1166                        && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
1167                        && !is_b8x8)
1168                         *mb_type |= MB_TYPE_16x8;
1169                     else
1170                         *mb_type |= MB_TYPE_8x8;
1171                 }else{
1172                     /* field to frame scaling */
1173                     /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
1174                      * but in MBAFF, top and bottom POC are equal */
1175                     int dy = (s->mb_y&1) ? 1 : 2;
1176                     mb_types_col[0] =
1177                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1178                     l1ref0 += dy*h->b8_stride;
1179                     l1ref1 += dy*h->b8_stride;
1180                     l1mv0 += 2*dy*h->b_stride;
1181                     l1mv1 += 2*dy*h->b_stride;
1182                     y_shift = 2;
1183
1184                     if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
1185                        && !is_b8x8)
1186                         *mb_type |= MB_TYPE_16x16;
1187                     else
1188                         *mb_type |= MB_TYPE_8x8;
1189                 }
1190
1191                 for(i8=0; i8<4; i8++){
1192                     const int x8 = i8&1;
1193                     const int y8 = i8>>1;
1194                     int ref0, scale;
1195                     const int16_t (*l1mv)[2]= l1mv0;
1196
1197                     if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1198                         continue;
1199                     h->sub_mb_type[i8] = sub_mb_type;
1200
1201                     fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                         /* intra co-located block: reference 0 and zero MVs in both lists */
1202                     if(IS_INTRA(mb_types_col[y8])){
1203                         fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1204                         fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1205                         fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1206                         continue;
1207                     }
1208
                         /* use the co-located list 0 reference/MV, or list 1 if list 0 is unused */
1209                     ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
1210                     if(ref0 >= 0)
1211                         ref0 = map_col_to_list0[0][ref0*2>>y_shift];
1212                     else{
1213                         ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
1214                         l1mv= l1mv1;
1215                     }
1216                     scale = dist_scale_factor[ref0];
1217                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1218
1219                     {
1220                         const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
                             /* vertical component halved (y_shift 0) or doubled (y_shift 2) */
1221                         int my_col = (mv_col[1]<<y_shift)/2;
1222                         int mx = (scale * mv_col[0] + 128) >> 8;
1223                         int my = (scale * my_col + 128) >> 8;
1224                         fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1225                         fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1226                     }
1227                 }
1228                 return;
1229             }
1230         }
1231
1232         /* one-to-one mv scaling */
1233
1234         if(IS_16X16(*mb_type)){
1235             int ref, mv0, mv1;
1236
1237             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1238             if(IS_INTRA(mb_type_col)){
1239                 ref=mv0=mv1=0;
1240             }else{
1241                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
1242                                                 : map_col_to_list0[1][l1ref1[0]];
1243                 const int scale = dist_scale_factor[ref0];
1244                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1245                 int mv_l0[2];
1246                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1247                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1248                 ref= ref0;
1249                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1250                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1251             }
1252             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1253             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1254             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1255         }else{
1256             for(i8=0; i8<4; i8++){
1257                 const int x8 = i8&1;
1258                 const int y8 = i8>>1;
1259                 int ref0, scale;
1260                 const int16_t (*l1mv)[2]= l1mv0;
1261
1262                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1263                     continue;
1264                 h->sub_mb_type[i8] = sub_mb_type;
1265                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1266                 if(IS_INTRA(mb_type_col)){
1267                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1268                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1269                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1270                     continue;
1271                 }
1272
1273                 ref0 = l1ref0[x8 + y8*h->b8_stride];
1274                 if(ref0 >= 0)
1275                     ref0 = map_col_to_list0[0][ref0];
1276                 else{
1277                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
1278                     l1mv= l1mv1;
1279                 }
1280                 scale = dist_scale_factor[ref0];
1281
1282                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
                     /* one MV per 8x8 block, or one per 4x4 block below */
1283                 if(IS_SUB_8X8(sub_mb_type)){
1284                     const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1285                     int mx = (scale * mv_col[0] + 128) >> 8;
1286                     int my = (scale * mv_col[1] + 128) >> 8;
1287                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1288                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1289                 }else
1290                 for(i4=0; i4<4; i4++){
1291                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1292                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1293                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1294                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1295                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1296                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1297                 }
1298             }
1299         }
1300     }
1301 }
1302
1303 static inline void write_back_motion(H264Context *h, int mb_type){
1304     MpegEncContext * const s = &h->s;
1305     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1306     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1307     int list;
1308
1309     if(!USES_LIST(mb_type, 0))
1310         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1311
1312     for(list=0; list<h->list_count; list++){
1313         int y;
1314         if(!USES_LIST(mb_type, list))
1315             continue;
1316
1317         for(y=0; y<4; y++){
1318             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1319             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1320         }
1321         if( h->pps.cabac ) {
1322             if(IS_SKIP(mb_type))
1323                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1324             else
1325             for(y=0; y<4; y++){
1326                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1327                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1328             }
1329         }
1330
1331         {
1332             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1333             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1334             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1335             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1336             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1337         }
1338     }
1339
1340     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1341         if(IS_8X8(mb_type)){
1342             uint8_t *direct_table = &h->direct_table[b8_xy];
1343             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1344             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1345             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1346         }
1347     }
1348 }
1349
1350 /**
1351  * Decodes a network abstraction layer unit.
1352  * @param consumed is the number of bytes used as input
1353  * @param length is the length of the array
1354  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1355  * @returns decoded bytes, might be src+1 if no escapes
1356  */
1357 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1358     int i, si, di;
1359     uint8_t *dst;
1360     int bufidx;
1361
1362 //    src[0]&0x80;                //forbidden bit
1363     h->nal_ref_idc= src[0]>>5;
1364     h->nal_unit_type= src[0]&0x1F;
1365
1366     src++; length--;
1367 #if 0
1368     for(i=0; i<length; i++)
1369         printf("%2X ", src[i]);
1370 #endif
1371     for(i=0; i+1<length; i+=2){
1372         if(src[i]) continue;
1373         if(i>0 && src[i-1]==0) i--;
1374         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1375             if(src[i+2]!=3){
1376                 /* startcode, so we must be past the end */
1377                 length=i;
1378             }
1379             break;
1380         }
1381     }
1382
1383     if(i>=length-1){ //no escaped 0
1384         *dst_length= length;
1385         *consumed= length+1; //+1 for the header
1386         return src;
1387     }
1388
1389     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1390     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
1391     dst= h->rbsp_buffer[bufidx];
1392
1393     if (dst == NULL){
1394         return NULL;
1395     }
1396
1397 //printf("decoding esc\n");
1398     si=di=0;
1399     while(si<length){
1400         //remove escapes (very rare 1:2^22)
1401         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1402             if(src[si+2]==3){ //escape
1403                 dst[di++]= 0;
1404                 dst[di++]= 0;
1405                 si+=3;
1406                 continue;
1407             }else //next start code
1408                 break;
1409         }
1410
1411         dst[di++]= src[si++];
1412     }
1413
1414     *dst_length= di;
1415     *consumed= si + 1;//+1 for the header
1416 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1417     return dst;
1418 }
1419
1420 /**
1421  * identifies the exact end of the bitstream
1422  * @return the length of the trailing, or 0 if damaged
1423  */
1424 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1425     int v= *src;
1426     int r;
1427
1428     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1429
1430     for(r=1; r<9; r++){
1431         if(v&1) return r;
1432         v>>=1;
1433     }
1434     return 0;
1435 }
1436
1437 /**
1438  * IDCT transforms the 16 dc values and dequantizes them.
1439  * @param qp quantization parameter
1440  */
1441 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1442 #define stride 16
1443     int i;
1444     int temp[16]; //FIXME check if this is a good idea
1445     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1446     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1447
1448 //memset(block, 64, 2*256);
1449 //return;
1450     for(i=0; i<4; i++){
1451         const int offset= y_offset[i];
1452         const int z0= block[offset+stride*0] + block[offset+stride*4];
1453         const int z1= block[offset+stride*0] - block[offset+stride*4];
1454         const int z2= block[offset+stride*1] - block[offset+stride*5];
1455         const int z3= block[offset+stride*1] + block[offset+stride*5];
1456
1457         temp[4*i+0]= z0+z3;
1458         temp[4*i+1]= z1+z2;
1459         temp[4*i+2]= z1-z2;
1460         temp[4*i+3]= z0-z3;
1461     }
1462
1463     for(i=0; i<4; i++){
1464         const int offset= x_offset[i];
1465         const int z0= temp[4*0+i] + temp[4*2+i];
1466         const int z1= temp[4*0+i] - temp[4*2+i];
1467         const int z2= temp[4*1+i] - temp[4*3+i];
1468         const int z3= temp[4*1+i] + temp[4*3+i];
1469
1470         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1471         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1472         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1473         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1474     }
1475 }
1476
#if 0
/**
 * DCT transforms the 16 dc values.
 * Disabled code; it relies on the stride macro still being defined.
 * The transform is done in place and every result is shifted right by 1.
 * The qp parameter is commented out in the signature ??? FIXME
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    int i;
    int temp[16]; //FIXME check if this is a good idea
    static const int out_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int in_offset[4] ={0, 2*stride, 8* stride, 10*stride};

    // first pass: butterflies over the input, results go to temp
    for(i=0; i<4; i++){
        const DCTELEM *in= block + in_offset[i];
        int *out= temp + 4*i;
        const int z0= in[stride*0] + in[stride*4];
        const int z1= in[stride*0] - in[stride*4];
        const int z2= in[stride*1] - in[stride*5];
        const int z3= in[stride*1] + in[stride*5];

        out[0]= z0+z3;
        out[1]= z1+z2;
        out[2]= z1-z2;
        out[3]= z0-z3;
    }

    // second pass: butterflies over temp, halved results go back to block
    for(i=0; i<4; i++){
        DCTELEM *out= block + out_offset[i];
        const int z0= temp[4*0+i] + temp[4*2+i];
        const int z1= temp[4*0+i] - temp[4*2+i];
        const int z2= temp[4*1+i] - temp[4*3+i];
        const int z3= temp[4*1+i] + temp[4*3+i];

        out[stride*0 ]= (z0 + z3)>>1;
        out[stride*2 ]= (z1 + z2)>>1;
        out[stride*8 ]= (z1 - z2)>>1;
        out[stride*10]= (z0 - z3)>>1;
    }
}
#endif
1516
1517 #undef xStride
1518 #undef stride
1519
1520 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1521     const int stride= 16*2;
1522     const int xStride= 16;
1523     int a,b,c,d,e;
1524
1525     a= block[stride*0 + xStride*0];
1526     b= block[stride*0 + xStride*1];
1527     c= block[stride*1 + xStride*0];
1528     d= block[stride*1 + xStride*1];
1529
1530     e= a-b;
1531     a= a+b;
1532     b= c-d;
1533     c= c+d;
1534
1535     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1536     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1537     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1538     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1539 }
1540
#if 0
/**
 * Transforms the 2x2 chroma dc values in place, without any scaling.
 * Disabled code. The four values are read from block at offsets 0, 16, 32
 * and 48.
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int stride= 16*2;
    const int xStride= 16;
    DCTELEM * const top= block;
    DCTELEM * const bottom= block + stride;
    const int top_sum = top[0] + top[xStride];
    const int top_diff= top[0] - top[xStride];
    const int bot_sum = bottom[0] + bottom[xStride];
    const int bot_diff= bottom[0] - bottom[xStride];

    top[0]         = (top_sum  + bot_sum );
    top[xStride]   = (top_diff + bot_diff);
    bottom[0]      = (top_sum  - bot_sum );
    bottom[xStride]= (top_diff - bot_diff);
}
#endif
1563
1564 /**
1565  * gets the chroma qp.
1566  */
1567 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1568     return h->pps.chroma_qp_table[t][qscale];
1569 }
1570
1571 //FIXME need to check that this does not overflow signed 32 bit for low qp, I am not sure, it's very close
1572 //FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
1573 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
1574     int i;
1575     const int * const quant_table= quant_coeff[qscale];
1576     const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1577     const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1578     const unsigned int threshold2= (threshold1<<1);
1579     int last_non_zero;
1580
1581     if(separate_dc){
1582         if(qscale<=18){
1583             //avoid overflows
1584             const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1585             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1586             const unsigned int dc_threshold2= (dc_threshold1<<1);
1587
1588             int level= block[0]*quant_coeff[qscale+18][0];
1589             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1590                 if(level>0){
1591                     level= (dc_bias + level)>>(QUANT_SHIFT-2);
1592                     block[0]= level;
1593                 }else{
1594                     level= (dc_bias - level)>>(QUANT_SHIFT-2);
1595                     block[0]= -level;
1596                 }
1597 //                last_non_zero = i;
1598             }else{
1599                 block[0]=0;
1600             }
1601         }else{
1602             const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1603             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1604             const unsigned int dc_threshold2= (dc_threshold1<<1);
1605
1606             int level= block[0]*quant_table[0];
1607             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1608                 if(level>0){
1609                     level= (dc_bias + level)>>(QUANT_SHIFT+1);
1610                     block[0]= level;
1611                 }else{
1612                     level= (dc_bias - level)>>(QUANT_SHIFT+1);
1613                     block[0]= -level;
1614                 }
1615 //                last_non_zero = i;
1616             }else{
1617                 block[0]=0;
1618             }
1619         }
1620         last_non_zero= 0;
1621         i=1;
1622     }else{
1623         last_non_zero= -1;
1624         i=0;
1625     }
1626
1627     for(; i<16; i++){
1628         const int j= scantable[i];
1629         int level= block[j]*quant_table[j];
1630
1631 //        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
1632 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
1633         if(((unsigned)(level+threshold1))>threshold2){
1634             if(level>0){
1635                 level= (bias + level)>>QUANT_SHIFT;
1636                 block[j]= level;
1637             }else{
1638                 level= (bias - level)>>QUANT_SHIFT;
1639                 block[j]= -level;
1640             }
1641             last_non_zero = i;
1642         }else{
1643             block[j]=0;
1644         }
1645     }
1646
1647     return last_non_zero;
1648 }
1649
1650 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1651                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1652                            int src_x_offset, int src_y_offset,
1653                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1654     MpegEncContext * const s = &h->s;
1655     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1656     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1657     const int luma_xy= (mx&3) + ((my&3)<<2);
1658     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1659     uint8_t * src_cb, * src_cr;
1660     int extra_width= h->emu_edge_width;
1661     int extra_height= h->emu_edge_height;
1662     int emu=0;
1663     const int full_mx= mx>>2;
1664     const int full_my= my>>2;
1665     const int pic_width  = 16*s->mb_width;
1666     const int pic_height = 16*s->mb_height >> MB_FIELD;
1667
1668     if(!pic->data[0]) //FIXME this is unacceptable, some sensible error concealment must be done for missing reference frames
1669         return;
1670
1671     if(mx&7) extra_width -= 3;
1672     if(my&7) extra_height -= 3;
1673
1674     if(   full_mx < 0-extra_width
1675        || full_my < 0-extra_height
1676        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1677        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1678         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1679             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1680         emu=1;
1681     }
1682
1683     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1684     if(!square){
1685         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1686     }
1687
1688     if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1689
1690     if(MB_FIELD){
1691         // chroma offset when predicting from a field of opposite parity
1692         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1693         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1694     }
1695     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1696     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1697
1698     if(emu){
1699         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1700             src_cb= s->edge_emu_buffer;
1701     }
1702     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1703
1704     if(emu){
1705         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1706             src_cr= s->edge_emu_buffer;
1707     }
1708     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1709 }
1710
1711 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1712                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1713                            int x_offset, int y_offset,
1714                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1715                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1716                            int list0, int list1){
1717     MpegEncContext * const s = &h->s;
1718     qpel_mc_func *qpix_op=  qpix_put;
1719     h264_chroma_mc_func chroma_op= chroma_put;
1720
1721     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1722     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1723     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1724     x_offset += 8*s->mb_x;
1725     y_offset += 8*(s->mb_y >> MB_FIELD);
1726
1727     if(list0){
1728         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1729         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1730                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1731                            qpix_op, chroma_op);
1732
1733         qpix_op=  qpix_avg;
1734         chroma_op= chroma_avg;
1735     }
1736
1737     if(list1){
1738         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1739         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1740                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1741                            qpix_op, chroma_op);
1742     }
1743 }
1744
1745 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1746                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1747                            int x_offset, int y_offset,
1748                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1749                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1750                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1751                            int list0, int list1){
1752     MpegEncContext * const s = &h->s;
1753
1754     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1755     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1756     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1757     x_offset += 8*s->mb_x;
1758     y_offset += 8*(s->mb_y >> MB_FIELD);
1759
1760     if(list0 && list1){
1761         /* don't optimize for luma-only case, since B-frames usually
1762          * use implicit weights => chroma too. */
1763         uint8_t *tmp_cb = s->obmc_scratchpad;
1764         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1765         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1766         int refn0 = h->ref_cache[0][ scan8[n] ];
1767         int refn1 = h->ref_cache[1][ scan8[n] ];
1768
1769         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1770                     dest_y, dest_cb, dest_cr,
1771                     x_offset, y_offset, qpix_put, chroma_put);
1772         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1773                     tmp_y, tmp_cb, tmp_cr,
1774                     x_offset, y_offset, qpix_put, chroma_put);
1775
1776         if(h->use_weight == 2){
1777             int weight0 = h->implicit_weight[refn0][refn1];
1778             int weight1 = 64 - weight0;
1779             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1780             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1781             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1782         }else{
1783             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1784                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1785                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1786             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1787                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1788                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1789             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1790                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1791                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1792         }
1793     }else{
1794         int list = list1 ? 1 : 0;
1795         int refn = h->ref_cache[list][ scan8[n] ];
1796         Picture *ref= &h->ref_list[list][refn];
1797         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1798                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1799                     qpix_put, chroma_put);
1800
1801         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1802                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1803         if(h->use_weight_chroma){
1804             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1805                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1806             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1807                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1808         }
1809     }
1810 }
1811
1812 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1813                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1814                            int x_offset, int y_offset,
1815                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1816                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1817                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1818                            int list0, int list1){
1819     if((h->use_weight==2 && list0 && list1
1820         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1821        || h->use_weight==1)
1822         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1823                          x_offset, y_offset, qpix_put, chroma_put,
1824                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1825     else
1826         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1827                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1828 }
1829
1830 static inline void prefetch_motion(H264Context *h, int list){
1831     /* fetch pixels for estimated mv 4 macroblocks ahead
1832      * optimized for 64byte cache lines */
1833     MpegEncContext * const s = &h->s;
1834     const int refn = h->ref_cache[list][scan8[0]];
1835     if(refn >= 0){
1836         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1837         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1838         uint8_t **src= h->ref_list[list][refn].data;
1839         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1840         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1841         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1842         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1843     }
1844 }
1845
1846 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1847                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1848                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1849                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1850     MpegEncContext * const s = &h->s;
1851     const int mb_xy= h->mb_xy;
1852     const int mb_type= s->current_picture.mb_type[mb_xy];
1853
1854     assert(IS_INTER(mb_type));
1855
1856     prefetch_motion(h, 0);
1857
1858     if(IS_16X16(mb_type)){
1859         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1860                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1861                 &weight_op[0], &weight_avg[0],
1862                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1863     }else if(IS_16X8(mb_type)){
1864         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1865                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1866                 &weight_op[1], &weight_avg[1],
1867                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1868         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1869                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1870                 &weight_op[1], &weight_avg[1],
1871                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1872     }else if(IS_8X16(mb_type)){
1873         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1874                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1875                 &weight_op[2], &weight_avg[2],
1876                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1877         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1878                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1879                 &weight_op[2], &weight_avg[2],
1880                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1881     }else{
1882         int i;
1883
1884         assert(IS_8X8(mb_type));
1885
1886         for(i=0; i<4; i++){
1887             const int sub_mb_type= h->sub_mb_type[i];
1888             const int n= 4*i;
1889             int x_offset= (i&1)<<2;
1890             int y_offset= (i&2)<<1;
1891
1892             if(IS_SUB_8X8(sub_mb_type)){
1893                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1894                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1895                     &weight_op[3], &weight_avg[3],
1896                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1897             }else if(IS_SUB_8X4(sub_mb_type)){
1898                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1899                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1900                     &weight_op[4], &weight_avg[4],
1901                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1902                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1903                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1904                     &weight_op[4], &weight_avg[4],
1905                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1906             }else if(IS_SUB_4X8(sub_mb_type)){
1907                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1908                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1909                     &weight_op[5], &weight_avg[5],
1910                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1911                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1912                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1913                     &weight_op[5], &weight_avg[5],
1914                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1915             }else{
1916                 int j;
1917                 assert(IS_SUB_4X4(sub_mb_type));
1918                 for(j=0; j<4; j++){
1919                     int sub_x_offset= x_offset + 2*(j&1);
1920                     int sub_y_offset= y_offset +   (j&2);
1921                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1922                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1923                         &weight_op[6], &weight_avg[6],
1924                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1925                 }
1926             }
1927         }
1928     }
1929
1930     prefetch_motion(h, 1);
1931 }
1932
1933 static av_cold void decode_init_vlc(void){
1934     static int done = 0;
1935
1936     if (!done) {
1937         int i;
1938         done = 1;
1939
1940         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1941                  &chroma_dc_coeff_token_len [0], 1, 1,
1942                  &chroma_dc_coeff_token_bits[0], 1, 1, 1);
1943
1944         for(i=0; i<4; i++){
1945             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1946                      &coeff_token_len [i][0], 1, 1,
1947                      &coeff_token_bits[i][0], 1, 1, 1);
1948         }
1949
1950         for(i=0; i<3; i++){
1951             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1952                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1953                      &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
1954         }
1955         for(i=0; i<15; i++){
1956             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
1957                      &total_zeros_len [i][0], 1, 1,
1958                      &total_zeros_bits[i][0], 1, 1, 1);
1959         }
1960
1961         for(i=0; i<6; i++){
1962             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
1963                      &run_len [i][0], 1, 1,
1964                      &run_bits[i][0], 1, 1, 1);
1965         }
1966         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1967                  &run_len [6][0], 1, 1,
1968                  &run_bits[6][0], 1, 1, 1);
1969     }
1970 }
1971
1972 static void free_tables(H264Context *h){
1973     int i;
1974     H264Context *hx;
1975     av_freep(&h->intra4x4_pred_mode);
1976     av_freep(&h->chroma_pred_mode_table);
1977     av_freep(&h->cbp_table);
1978     av_freep(&h->mvd_table[0]);
1979     av_freep(&h->mvd_table[1]);
1980     av_freep(&h->direct_table);
1981     av_freep(&h->non_zero_count);
1982     av_freep(&h->slice_table_base);
1983     h->slice_table= NULL;
1984
1985     av_freep(&h->mb2b_xy);
1986     av_freep(&h->mb2b8_xy);
1987
1988     for(i = 0; i < MAX_SPS_COUNT; i++)
1989         av_freep(h->sps_buffers + i);
1990
1991     for(i = 0; i < MAX_PPS_COUNT; i++)
1992         av_freep(h->pps_buffers + i);
1993
1994     for(i = 0; i < h->s.avctx->thread_count; i++) {
1995         hx = h->thread_context[i];
1996         if(!hx) continue;
1997         av_freep(&hx->top_borders[1]);
1998         av_freep(&hx->top_borders[0]);
1999         av_freep(&hx->s.obmc_scratchpad);
2000     }
2001 }
2002
2003 static void init_dequant8_coeff_table(H264Context *h){
2004     int i,q,x;
2005     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2006     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2007     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2008
2009     for(i=0; i<2; i++ ){
2010         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2011             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2012             break;
2013         }
2014
2015         for(q=0; q<52; q++){
2016             int shift = ff_div6[q];
2017             int idx = ff_rem6[q];
2018             for(x=0; x<64; x++)
2019                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2020                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2021                     h->pps.scaling_matrix8[i][x]) << shift;
2022         }
2023     }
2024 }
2025
2026 static void init_dequant4_coeff_table(H264Context *h){
2027     int i,j,q,x;
2028     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2029     for(i=0; i<6; i++ ){
2030         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2031         for(j=0; j<i; j++){
2032             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2033                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2034                 break;
2035             }
2036         }
2037         if(j<i)
2038             continue;
2039
2040         for(q=0; q<52; q++){
2041             int shift = ff_div6[q] + 2;
2042             int idx = ff_rem6[q];
2043             for(x=0; x<16; x++)
2044                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2045                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2046                     h->pps.scaling_matrix4[i][x]) << shift;
2047         }
2048     }
2049 }
2050
2051 static void init_dequant_tables(H264Context *h){
2052     int i,x;
2053     init_dequant4_coeff_table(h);
2054     if(h->pps.transform_8x8_mode)
2055         init_dequant8_coeff_table(h);
2056     if(h->sps.transform_bypass){
2057         for(i=0; i<6; i++)
2058             for(x=0; x<16; x++)
2059                 h->dequant4_coeff[i][0][x] = 1<<6;
2060         if(h->pps.transform_8x8_mode)
2061             for(i=0; i<2; i++)
2062                 for(x=0; x<64; x++)
2063                     h->dequant8_coeff[i][0][x] = 1<<6;
2064     }
2065 }
2066
2067
2068 /**
2069  * allocates tables.
2070  * needs width/height
2071  */
2072 static int alloc_tables(H264Context *h){
2073     MpegEncContext * const s = &h->s;
2074     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2075     int x,y;
2076
2077     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2078
2079     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2080     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
2081     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2082
2083     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2084     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2085     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2086     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2087
2088     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(uint8_t));
2089     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2090
2091     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2092     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2093     for(y=0; y<s->mb_height; y++){
2094         for(x=0; x<s->mb_width; x++){
2095             const int mb_xy= x + y*s->mb_stride;
2096             const int b_xy = 4*x + 4*y*h->b_stride;
2097             const int b8_xy= 2*x + 2*y*h->b8_stride;
2098
2099             h->mb2b_xy [mb_xy]= b_xy;
2100             h->mb2b8_xy[mb_xy]= b8_xy;
2101         }
2102     }
2103
2104     s->obmc_scratchpad = NULL;
2105
2106     if(!h->dequant4_coeff[0])
2107         init_dequant_tables(h);
2108
2109     return 0;
2110 fail:
2111     free_tables(h);
2112     return -1;
2113 }
2114
2115 /**
2116  * Mimic alloc_tables(), but for every context thread.
2117  */
2118 static void clone_tables(H264Context *dst, H264Context *src){
2119     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2120     dst->non_zero_count           = src->non_zero_count;
2121     dst->slice_table              = src->slice_table;
2122     dst->cbp_table                = src->cbp_table;
2123     dst->mb2b_xy                  = src->mb2b_xy;
2124     dst->mb2b8_xy                 = src->mb2b8_xy;
2125     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2126     dst->mvd_table[0]             = src->mvd_table[0];
2127     dst->mvd_table[1]             = src->mvd_table[1];
2128     dst->direct_table             = src->direct_table;
2129
2130     dst->s.obmc_scratchpad = NULL;
2131     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2132 }
2133
2134 /**
2135  * Init context
2136  * Allocate buffers which are not shared amongst multiple threads.
2137  */
2138 static int context_init(H264Context *h){
2139     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2140     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2141
2142     return 0;
2143 fail:
2144     return -1; // free_tables will clean up for us
2145 }
2146
2147 static av_cold void common_init(H264Context *h){
2148     MpegEncContext * const s = &h->s;
2149
2150     s->width = s->avctx->width;
2151     s->height = s->avctx->height;
2152     s->codec_id= s->avctx->codec->id;
2153
2154     ff_h264_pred_init(&h->hpc, s->codec_id);
2155
2156     h->dequant_coeff_pps= -1;
2157     s->unrestricted_mv=1;
2158     s->decode=1; //FIXME
2159
2160     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2161     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2162 }
2163
2164 static av_cold int decode_init(AVCodecContext *avctx){
2165     H264Context *h= avctx->priv_data;
2166     MpegEncContext * const s = &h->s;
2167
2168     MPV_decode_defaults(s);
2169
2170     s->avctx = avctx;
2171     common_init(h);
2172
2173     s->out_format = FMT_H264;
2174     s->workaround_bugs= avctx->workaround_bugs;
2175
2176     // set defaults
2177 //    s->decode_mb= ff_h263_decode_mb;
2178     s->quarter_sample = 1;
2179     s->low_delay= 1;
2180
2181     if(avctx->codec_id == CODEC_ID_SVQ3)
2182         avctx->pix_fmt= PIX_FMT_YUVJ420P;
2183     else
2184         avctx->pix_fmt= PIX_FMT_YUV420P;
2185
2186     decode_init_vlc();
2187
2188     if(avctx->extradata_size > 0 && avctx->extradata &&
2189        *(char *)avctx->extradata == 1){
2190         h->is_avc = 1;
2191         h->got_avcC = 0;
2192     } else {
2193         h->is_avc = 0;
2194     }
2195
2196     h->thread_context[0] = h;
2197     h->outputed_poc = INT_MIN;
2198     return 0;
2199 }
2200
2201 static int frame_start(H264Context *h){
2202     MpegEncContext * const s = &h->s;
2203     int i;
2204
2205     if(MPV_frame_start(s, s->avctx) < 0)
2206         return -1;
2207     ff_er_frame_start(s);
2208     /*
2209      * MPV_frame_start uses pict_type to derive key_frame.
2210      * This is incorrect for H.264; IDR markings must be used.
2211      * Zero here; IDR markings per slice in frame or fields are ORed in later.
2212      * See decode_nal_units().
2213      */
2214     s->current_picture_ptr->key_frame= 0;
2215
2216     assert(s->linesize && s->uvlinesize);
2217
2218     for(i=0; i<16; i++){
2219         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2220         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2221     }
2222     for(i=0; i<4; i++){
2223         h->block_offset[16+i]=
2224         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2225         h->block_offset[24+16+i]=
2226         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2227     }
2228
2229     /* can't be in alloc_tables because linesize isn't known there.
2230      * FIXME: redo bipred weight to not require extra buffer? */
2231     for(i = 0; i < s->avctx->thread_count; i++)
2232         if(!h->thread_context[i]->s.obmc_scratchpad)
2233             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2234
2235     /* some macroblocks will be accessed before they're available */
2236     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2237         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
2238
2239 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2240
2241     // We mark the current picture as non-reference after allocating it, so
2242     // that if we break out due to an error it can be released automatically
2243     // in the next MPV_frame_start().
2244     // SVQ3 as well as most other codecs have only last/next/current and thus
2245     // get released even with set reference, besides SVQ3 and others do not
2246     // mark frames as reference later "naturally".
2247     if(s->codec_id != CODEC_ID_SVQ3)
2248         s->current_picture_ptr->reference= 0;
2249
2250     s->current_picture_ptr->field_poc[0]=
2251     s->current_picture_ptr->field_poc[1]= INT_MAX;
2252     assert(s->current_picture_ptr->long_ref==0);
2253
2254     return 0;
2255 }
2256
2257 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2258     MpegEncContext * const s = &h->s;
2259     int i;
2260
2261     src_y  -=   linesize;
2262     src_cb -= uvlinesize;
2263     src_cr -= uvlinesize;
2264
2265     // There are two lines saved, the line above the the top macroblock of a pair,
2266     // and the line above the bottom macroblock
2267     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2268     for(i=1; i<17; i++){
2269         h->left_border[i]= src_y[15+i*  linesize];
2270     }
2271
2272     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2273     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2274
2275     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2276         h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
2277         h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
2278         for(i=1; i<9; i++){
2279             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
2280             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
2281         }
2282         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2283         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2284     }
2285 }
2286
2287 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2288     MpegEncContext * const s = &h->s;
2289     int temp8, i;
2290     uint64_t temp64;
2291     int deblock_left;
2292     int deblock_top;
2293     int mb_xy;
2294
2295     if(h->deblocking_filter == 2) {
2296         mb_xy = h->mb_xy;
2297         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2298         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2299     } else {
2300         deblock_left = (s->mb_x > 0);
2301         deblock_top =  (s->mb_y > 0);
2302     }
2303
2304     src_y  -=   linesize + 1;
2305     src_cb -= uvlinesize + 1;
2306     src_cr -= uvlinesize + 1;
2307
2308 #define XCHG(a,b,t,xchg)\
2309 t= a;\
2310 if(xchg)\
2311     a= b;\
2312 b= t;
2313
2314     if(deblock_left){
2315         for(i = !deblock_top; i<17; i++){
2316             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2317         }
2318     }
2319
2320     if(deblock_top){
2321         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2322         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2323         if(s->mb_x+1 < s->mb_width){
2324             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2325         }
2326     }
2327
2328     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2329         if(deblock_left){
2330             for(i = !deblock_top; i<9; i++){
2331                 XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
2332                 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
2333             }
2334         }
2335         if(deblock_top){
2336             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2337             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2338         }
2339     }
2340 }
2341
2342 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
2343     MpegEncContext * const s = &h->s;
2344     int i;
2345
2346     src_y  -= 2 *   linesize;
2347     src_cb -= 2 * uvlinesize;
2348     src_cr -= 2 * uvlinesize;
2349
2350     // There are two lines saved, the line above the the top macroblock of a pair,
2351     // and the line above the bottom macroblock
2352     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2353     h->left_border[1]= h->top_borders[1][s->mb_x][15];
2354     for(i=2; i<34; i++){
2355         h->left_border[i]= src_y[15+i*  linesize];
2356     }
2357
2358     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
2359     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
2360     *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
2361     *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
2362
2363     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2364         h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
2365         h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
2366         h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
2367         h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
2368         for(i=2; i<18; i++){
2369             h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
2370             h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
2371         }
2372         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
2373         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
2374         *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
2375         *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
2376     }
2377 }
2378
2379 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
2380     MpegEncContext * const s = &h->s;
2381     int temp8, i;
2382     uint64_t temp64;
2383     int deblock_left = (s->mb_x > 0);
2384     int deblock_top  = (s->mb_y > 1);
2385
2386     tprintf(s->avctx, "xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
2387
2388     src_y  -= 2 *   linesize + 1;
2389     src_cb -= 2 * uvlinesize + 1;
2390     src_cr -= 2 * uvlinesize + 1;
2391
2392 #define XCHG(a,b,t,xchg)\
2393 t= a;\
2394 if(xchg)\
2395     a= b;\
2396 b= t;
2397
2398     if(deblock_left){
2399         for(i = (!deblock_top)<<1; i<34; i++){
2400             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2401         }
2402     }
2403
2404     if(deblock_top){
2405         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2406         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2407         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
2408         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
2409         if(s->mb_x+1 < s->mb_width){
2410             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2411             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
2412         }
2413     }
2414
2415     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2416         if(deblock_left){
2417             for(i = (!deblock_top) << 1; i<18; i++){
2418                 XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
2419                 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
2420             }
2421         }
2422         if(deblock_top){
2423             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2424             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2425             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
2426             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
2427         }
2428     }
2429 }
2430
/**
 * Reconstructs the current macroblock (s->mb_x, s->mb_y) into
 * s->current_picture and, if h->deblocking_filter is set, deblocks it.
 *
 * Steps, in order: compute destination pointers, handle field macroblocks,
 * select the residual add functions, then either copy PCM samples or run
 * intra prediction / motion compensation followed by adding the luma and
 * chroma residuals, and finally call the deblocking filter.
 *
 * @param simple  nonzero on the fast path: every check written as
 *                "!simple && ..." is skipped and every check written as
 *                "simple || ..." is taken, i.e. no field/MBAFF, PCM, gray,
 *                encoder or non-H.264 handling is done.
 */
2431 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2432     MpegEncContext * const s = &h->s;
2433     const int mb_x= s->mb_x;
2434     const int mb_y= s->mb_y;
2435     const int mb_xy= h->mb_xy;
2436     const int mb_type= s->current_picture.mb_type[mb_xy];
2437     uint8_t  *dest_y, *dest_cb, *dest_cr;
2438     int linesize, uvlinesize /*dct_offset*/;
2439     int i;
2440     int *block_offset = &h->block_offset[0];
2441     const unsigned int bottom = mb_y & 1;
2442     const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
2443     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2444     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2445
     /* top-left sample of this macroblock in each plane (16x16 luma, 8x8 chroma) */
2446     dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2447     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2448     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2449
2450     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2451     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2452
     /* field macroblock: lines are interleaved, so use doubled strides and the
      * second half of block_offset[], which frame_start() fills for that case */
2453     if (!simple && MB_FIELD) {
2454         linesize   = h->mb_linesize   = s->linesize * 2;
2455         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2456         block_offset = &h->block_offset[24];
         /* odd macroblock row: move up 15 luma / 7 chroma lines */
2457         if(mb_y&1){ //FIXME move out of this function?
2458             dest_y -= s->linesize*15;
2459             dest_cb-= s->uvlinesize*7;
2460             dest_cr-= s->uvlinesize*7;
2461         }
2462         if(FRAME_MBAFF) {
2463             int list;
             /* rewrite the cached reference indices to (16+ref)^(mb_y&1) */
2464             for(list=0; list<h->list_count; list++){
2465                 if(!USES_LIST(mb_type, list))
2466                     continue;
2467                 if(IS_16X16(mb_type)){
2468                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2469                     fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2470                 }else{
2471                     for(i=0; i<16; i+=4){
2472                         //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
2473                         int ref = h->ref_cache[list][scan8[i]];
2474                         if(ref >= 0)
2475                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2476                     }
2477                 }
2478             }
2479         }
2480     } else {
2481         linesize   = h->mb_linesize   = s->linesize;
2482         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2483 //        dct_offset = s->linesize * 16;
2484     }
2485
     /* choose the functions that add the luma residual to the prediction:
      * plain pixel add for transform bypass, otherwise the 8x8 or 4x4 IDCT */
2486     if(transform_bypass){
2487         idct_dc_add =
2488         idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2489     }else if(IS_8x8DCT(mb_type)){
2490         idct_dc_add = s->dsp.h264_idct8_dc_add;
2491         idct_add = s->dsp.h264_idct8_add;
2492     }else{
2493         idct_dc_add = s->dsp.h264_idct_dc_add;
2494         idct_add = s->dsp.h264_idct_add;
2495     }
2496
     /* MBAFF intra macroblock with deblocking: swap in the saved pair borders
      * before prediction (skipped for a bottom MB whose top MB is intra too) */
2497     if(!simple && FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
2498        && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
2499         int mbt_y = mb_y&~1;
2500         uint8_t *top_y  = s->current_picture.data[0] + (mbt_y * 16* s->linesize  ) + mb_x * 16;
2501         uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2502         uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2503         xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
2504     }
2505
     /* PCM macroblock: the samples are copied straight out of h->mb */
2506     if (!simple && IS_INTRA_PCM(mb_type)) {
2507         for (i=0; i<16; i++) {
2508             memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
2509         }
2510         for (i=0; i<8; i++) {
2511             memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
2512             memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
2513         }
2514     } else {
2515         if(IS_INTRA(mb_type)){
             /* swap the saved borders into the picture for the duration of the
              * intra prediction; they are put back after it (xchg=0 call below) */
2516             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
2517                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2518
2519             if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2520                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2521                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2522             }
2523
2524             if(IS_INTRA4x4(mb_type)){
2525                 if(simple || !s->encoding){
                     /* per-block prediction immediately followed by its residual */
2526                     if(IS_8x8DCT(mb_type)){
2527                         for(i=0; i<16; i+=4){
2528                             uint8_t * const ptr= dest_y + block_offset[i];
2529                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2530                             const int nnz = h->non_zero_count_cache[ scan8[i] ];
2531                             h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2532                                                    (h->topright_samples_available<<i)&0x4000, linesize);
2533                             if(nnz){
2534                                 if(nnz == 1 && h->mb[i*16])
2535                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2536                                 else
2537                                     idct_add(ptr, h->mb + i*16, linesize);
2538                             }
2539                         }
2540                     }else
2541                     for(i=0; i<16; i++){
2542                         uint8_t * const ptr= dest_y + block_offset[i];
2543                         uint8_t *topright;
2544                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2545                         int nnz, tr;
2546
                         /* only these two modes get a top-right pointer; if the
                          * top-right samples are unavailable, the sample above the
                          * block's last column is replicated four times instead */
2547                         if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2548                             const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2549                             assert(mb_y || linesize <= block_offset[i]);
2550                             if(!topright_avail){
2551                                 tr= ptr[3 - linesize]*0x01010101;
2552                                 topright= (uint8_t*) &tr;
2553                             }else
2554                                 topright= ptr + 4 - linesize;
2555                         }else
2556                             topright= NULL;
2557
2558                         h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2559                         nnz = h->non_zero_count_cache[ scan8[i] ];
2560                         if(nnz){
2561                             if(is_h264){
2562                                 if(nnz == 1 && h->mb[i*16])
2563                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2564                                 else
2565                                     idct_add(ptr, h->mb + i*16, linesize);
2566                             }else
2567                                 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2568                         }
2569                     }
2570                 }
2571             }else{
                 /* intra 16x16: predict the whole MB, then dequantize/transform the
                  * luma DC coefficients; the AC residual is added further below */
2572                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2573                 if(is_h264){
2574                     if(!transform_bypass)
2575                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2576                 }else
2577                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2578             }
2579             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
2580                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2581         }else if(is_h264){
             /* inter macroblock: motion compensation */
2582             hl_motion(h, dest_y, dest_cb, dest_cr,
2583                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2584                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2585                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2586         }
2587
2588
         /* luma residual for everything except intra 4x4 (handled above) */
2589         if(!IS_INTRA4x4(mb_type)){
2590             if(is_h264){
2591                 if(IS_INTRA16x16(mb_type)){
2592                     for(i=0; i<16; i++){
2593                         if(h->non_zero_count_cache[ scan8[i] ])
2594                             idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2595                         else if(h->mb[i*16])
2596                             idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2597                     }
2598                 }else{
                     /* step over 4 entries at a time when 8x8 transforms are used */
2599                     const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2600                     for(i=0; i<16; i+=di){
2601                         int nnz = h->non_zero_count_cache[ scan8[i] ];
2602                         if(nnz){
2603                             if(nnz==1 && h->mb[i*16])
2604                                 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2605                             else
2606                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2607                         }
2608                     }
2609                 }
2610             }else{
2611                 for(i=0; i<16; i++){
2612                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2613                         uint8_t * const ptr= dest_y + block_offset[i];
2614                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2615                     }
2616                 }
2617             }
2618         }
2619
         /* chroma residual: blocks 16..19 go to Cb, 20..23 to Cr (dest[(i&4)>>2]) */
2620         if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2621             uint8_t *dest[2] = {dest_cb, dest_cr};
2622             if(transform_bypass){
2623                 idct_add = idct_dc_add = s->dsp.add_pixels4;
2624             }else{
2625                 idct_add = s->dsp.h264_idct_add;
2626                 idct_dc_add = s->dsp.h264_idct_dc_add;
                 /* chroma DC: dequant tables 1/2 for intra, 4/5 otherwise */
2627                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2628                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2629             }
2630             if(is_h264){
2631                 for(i=16; i<16+8; i++){
2632                     if(h->non_zero_count_cache[ scan8[i] ])
2633                         idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2634                     else if(h->mb[i*16])
2635                         idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2636                 }
2637             }else{
2638                 for(i=16; i<16+8; i++){
2639                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2640                         uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2641                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2642                     }
2643                 }
2644             }
2645         }
2646     }
     /* deblocking */
2647     if(h->deblocking_filter) {
2648         if (!simple && FRAME_MBAFF) {
             /* MBAFF: nothing is filtered until the bottom macroblock of a pair
              * has been reconstructed; then top and bottom are filtered in turn.
              * Note that mb_y / mb_xy below shadow the outer ones and refer to
              * the row above the current macroblock. */
2649             //FIXME try deblocking one mb at a time?
2650             // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
2651             const int mb_y = s->mb_y - 1;
2652             uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
2653             const int mb_xy= mb_x + mb_y*s->mb_stride;
             /* NOTE(review): these reads happen before the !bottom early return,
              * so for s->mb_y == 0 the index is mb_x - mb_stride; presumably
              * mb_type[] has padding in front that makes this safe -- TODO confirm */
2654             const int mb_type_top   = s->current_picture.mb_type[mb_xy];
2655             const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
2656             if (!bottom) return;
2657             pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2658             pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2659             pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2660
2661             if(IS_INTRA(mb_type_top | mb_type_bottom))
2662                 xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
2663
2664             backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
2665             // deblock a pair
2666             // top
             /* temporarily make the top macroblock the current one */
2667             s->mb_y--; h->mb_xy -= s->mb_stride;
2668             tprintf(h->s.avctx, "call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
2669             fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
2670             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2671             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2672             filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
2673             // bottom
             /* restore the current position and filter the bottom macroblock */
2674             s->mb_y++; h->mb_xy += s->mb_stride;
2675             tprintf(h->s.avctx, "call mbaff filter_mb\n");
2676             fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
2677             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2678             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2679             filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2680         } else {
             /* non-MBAFF: save the unfiltered borders first, then filter this MB */
2681             tprintf(h->s.avctx, "call filter_mb\n");
2682             backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2683             fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2684             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2685             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2686             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2687         }
2688     }
2689 }
2690
2691 /**
2692  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2693  */
2694 static void hl_decode_mb_simple(H264Context *h){
2695     hl_decode_mb_internal(h, 1);
2696 }
2697
2698 /**
2699  * Process a macroblock; this handles edge cases, such as interlacing.
2700  */
2701 static void av_noinline hl_decode_mb_complex(H264Context *h){
2702     hl_decode_mb_internal(h, 0);
2703 }
2704
2705 static void hl_decode_mb(H264Context *h){
2706     MpegEncContext * const s = &h->s;
2707     const int mb_xy= h->mb_xy;
2708     const int mb_type= s->current_picture.mb_type[mb_xy];
2709     int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 ||
2710                     (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || (ENABLE_H264_ENCODER && s->encoding) || ENABLE_SMALL;
2711
2712     if(ENABLE_H264_ENCODER && !s->decode)
2713         return;
2714
2715     if (is_complex)
2716         hl_decode_mb_complex(h);
2717     else hl_decode_mb_simple(h);
2718 }
2719
2720 static void pic_as_field(Picture *pic, const int parity){
2721     int i;
2722     for (i = 0; i < 4; ++i) {
2723         if (parity == PICT_BOTTOM_FIELD)
2724             pic->data[i] += pic->linesize[i];
2725         pic->reference = parity;
2726         pic->linesize[i] *= 2;
2727     }
2728     pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
2729 }
2730
2731 static int split_field_copy(Picture *dest, Picture *src,
2732                             int parity, int id_add){
2733     int match = !!(src->reference & parity);
2734
2735     if (match) {
2736         *dest = *src;
2737         if(parity != PICT_FRAME){
2738             pic_as_field(dest, parity);
2739             dest->pic_id *= 2;
2740             dest->pic_id += id_add;
2741         }
2742     }
2743
2744     return match;
2745 }
2746
2747 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2748     int i[2]={0};
2749     int index=0;
2750
2751     while(i[0]<len || i[1]<len){
2752         while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2753             i[0]++;
2754         while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2755             i[1]++;
2756         if(i[0] < len){
2757             in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2758             split_field_copy(&def[index++], in[ i[0]++ ], sel  , 1);
2759         }
2760         if(i[1] < len){
2761             in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2762             split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2763         }
2764     }
2765
2766     return index;
2767 }
2768
2769 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2770     int i, best_poc;
2771     int out_i= 0;
2772
2773     for(;;){
2774         best_poc= dir ? INT_MIN : INT_MAX;
2775
2776         for(i=0; i<len; i++){
2777             const int poc= src[i]->poc;
2778             if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2779                 best_poc= poc;
2780                 sorted[out_i]= src[i];
2781             }
2782         }
2783         if(best_poc == (dir ? INT_MIN : INT_MAX))
2784             break;
2785         limit= sorted[out_i++]->poc - dir;
2786     }
2787     return out_i;
2788 }
2789
2790 /**
2791  * fills the default_ref_list.
2792  */
2793 static int fill_default_ref_list(H264Context *h){
2794     MpegEncContext * const s = &h->s;
2795     int i, len;
2796
2797     if(h->slice_type_nos==FF_B_TYPE){
2798         Picture *sorted[32];
2799         int cur_poc, list;
2800         int lens[2];
2801
2802         if(FIELD_PICTURE)
2803             cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2804         else
2805             cur_poc= s->current_picture_ptr->poc;
2806
2807         for(list= 0; list<2; list++){
2808             len= add_sorted(sorted    , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2809             len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2810             assert(len<=32);
2811             len= build_def_list(h->default_ref_list[list]    , sorted     , len, 0, s->picture_structure);
2812             len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2813             assert(len<=32);
2814
2815             if(len < h->ref_count[list])
2816                 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2817             lens[list]= len;
2818         }
2819
2820         if(lens[0] == lens[1] && lens[1] > 1){
2821             for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2822             if(i == lens[0])
2823                 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2824         }
2825     }else{
2826         len = build_def_list(h->default_ref_list[0]    , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2827         len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16                , 1, s->picture_structure);
2828         assert(len <= 32);
2829         if(len < h->ref_count[0])
2830             memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2831     }
2832 #ifdef TRACE
2833     for (i=0; i<h->ref_count[0]; i++) {
2834         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2835     }
2836     if(h->slice_type_nos==FF_B_TYPE){
2837         for (i=0; i<h->ref_count[1]; i++) {
2838             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2839         }
2840     }
2841 #endif
2842     return 0;
2843 }
2844
/* Forward declarations: both debug printers are defined further down in this
 * file but are already called from decode_ref_pic_list_reordering(). */
2845 static void print_short_term(H264Context *h);
2846 static void print_long_term(H264Context *h);
2847
2848 /**
2849  * Extract structure information about the picture described by pic_num in
2850  * the current decoding context (frame or field). Note that pic_num is
2851  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2852  * @param pic_num picture number for which to extract structure information
2853  * @param structure one of PICT_XXX describing structure of picture
2854  *                      with pic_num
2855  * @return frame number (short term) or long term index of picture
2856  *         described by pic_num
2857  */
2858 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2859     MpegEncContext * const s = &h->s;
2860
2861     *structure = s->picture_structure;
2862     if(FIELD_PICTURE){
2863         if (!(pic_num & 1))
2864             /* opposite field */
2865             *structure ^= PICT_FRAME;
2866         pic_num >>= 1;
2867     }
2868
2869     return pic_num;
2870 }
2871
2872 static int decode_ref_pic_list_reordering(H264Context *h){
2873     MpegEncContext * const s = &h->s;
2874     int list, index, pic_structure;
2875
2876     print_short_term(h);
2877     print_long_term(h);
2878
2879     for(list=0; list<h->list_count; list++){
2880         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2881
2882         if(get_bits1(&s->gb)){
2883             int pred= h->curr_pic_num;
2884
2885             for(index=0; ; index++){
2886                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
2887                 unsigned int pic_id;
2888                 int i;
2889                 Picture *ref = NULL;
2890
2891                 if(reordering_of_pic_nums_idc==3)
2892                     break;
2893
2894                 if(index >= h->ref_count[list]){
2895                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2896                     return -1;
2897                 }
2898
2899                 if(reordering_of_pic_nums_idc<3){
2900                     if(reordering_of_pic_nums_idc<2){
2901                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2902                         int frame_num;
2903
2904                         if(abs_diff_pic_num > h->max_pic_num){
2905                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2906                             return -1;
2907                         }
2908
2909                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2910                         else                                pred+= abs_diff_pic_num;
2911                         pred &= h->max_pic_num - 1;
2912
2913                         frame_num = pic_num_extract(h, pred, &pic_structure);
2914
2915                         for(i= h->short_ref_count-1; i>=0; i--){
2916                             ref = h->short_ref[i];
2917                             assert(ref->reference);
2918                             assert(!ref->long_ref);
2919                             if(
2920                                    ref->frame_num == frame_num &&
2921                                    (ref->reference & pic_structure)
2922                               )
2923                                 break;
2924                         }
2925                         if(i>=0)
2926                             ref->pic_id= pred;
2927                     }else{
2928                         int long_idx;
2929                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2930
2931                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
2932
2933                         if(long_idx>31){
2934                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2935                             return -1;
2936                         }
2937                         ref = h->long_ref[long_idx];
2938                         assert(!(ref && !ref->reference));
2939                         if(ref && (ref->reference & pic_structure)){
2940                             ref->pic_id= pic_id;
2941                             assert(ref->long_ref);
2942                             i=0;
2943                         }else{
2944                             i=-1;
2945                         }
2946                     }
2947
2948                     if (i < 0) {
2949                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2950                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2951                     } else {
2952                         for(i=index; i+1<h->ref_count[list]; i++){
2953                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2954                                 break;
2955                         }
2956                         for(; i > index; i--){
2957                             h->ref_list[list][i]= h->ref_list[list][i-1];
2958                         }
2959                         h->ref_list[list][index]= *ref;
2960                         if (FIELD_PICTURE){
2961                             pic_as_field(&h->ref_list[list][index], pic_structure);
2962                         }
2963                     }
2964                 }else{
2965                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2966                     return -1;
2967                 }
2968             }
2969         }
2970     }
2971     for(list=0; list<h->list_count; list++){
2972         for(index= 0; index < h->ref_count[list]; index++){
2973             if(!h->ref_list[list][index].data[0]){
2974                 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2975                 h->ref_list[list][index]= s->current_picture; //FIXME this is not a sensible solution
2976             }
2977         }
2978     }
2979
2980     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
2981         direct_dist_scale_factor(h);
2982     direct_ref_list_init(h);
2983     return 0;
2984 }
2985
2986 static void fill_mbaff_ref_list(H264Context *h){
2987     int list, i, j;
2988     for(list=0; list<2; list++){ //FIXME try list_count
2989         for(i=0; i<h->ref_count[list]; i++){
2990             Picture *frame = &h->ref_list[list][i];
2991             Picture *field = &h->ref_list[list][16+2*i];
2992             field[0] = *frame;
2993             for(j=0; j<3; j++)
2994                 field[0].linesize[j] <<= 1;
2995             field[0].reference = PICT_TOP_FIELD;
2996             field[1] = field[0];
2997             for(j=0; j<3; j++)
2998                 field[1].data[j] += frame->linesize[j];
2999             field[1].reference = PICT_BOTTOM_FIELD;
3000
3001             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
3002             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
3003             for(j=0; j<2; j++){
3004                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
3005                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
3006             }
3007         }
3008     }
3009     for(j=0; j<h->ref_count[1]; j++){
3010         for(i=0; i<h->ref_count[0]; i++)
3011             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
3012         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
3013         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
3014     }
3015 }
3016
3017 static int pred_weight_table(H264Context *h){
3018     MpegEncContext * const s = &h->s;
3019     int list, i;
3020     int luma_def, chroma_def;
3021
3022     h->use_weight= 0;
3023     h->use_weight_chroma= 0;
3024     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3025     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3026     luma_def = 1<<h->luma_log2_weight_denom;
3027     chroma_def = 1<<h->chroma_log2_weight_denom;
3028
3029     for(list=0; list<2; list++){
3030         for(i=0; i<h->ref_count[list]; i++){
3031             int luma_weight_flag, chroma_weight_flag;
3032
3033             luma_weight_flag= get_bits1(&s->gb);
3034             if(luma_weight_flag){
3035                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3036                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3037                 if(   h->luma_weight[list][i] != luma_def
3038                    || h->luma_offset[list][i] != 0)
3039                     h->use_weight= 1;
3040             }else{
3041                 h->luma_weight[list][i]= luma_def;
3042                 h->luma_offset[list][i]= 0;
3043             }
3044
3045             if(CHROMA){
3046                 chroma_weight_flag= get_bits1(&s->gb);
3047                 if(chroma_weight_flag){
3048                     int j;
3049                     for(j=0; j<2; j++){
3050                         h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3051                         h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3052                         if(   h->chroma_weight[list][i][j] != chroma_def
3053                         || h->chroma_offset[list][i][j] != 0)
3054                             h->use_weight_chroma= 1;
3055                     }
3056                 }else{
3057                     int j;
3058                     for(j=0; j<2; j++){
3059                         h->chroma_weight[list][i][j]= chroma_def;
3060                         h->chroma_offset[list][i][j]= 0;
3061                     }
3062                 }
3063             }
3064         }
3065         if(h->slice_type_nos != FF_B_TYPE) break;
3066     }
3067     h->use_weight= h->use_weight || h->use_weight_chroma;
3068     return 0;
3069 }
3070
3071 static void implicit_weight_table(H264Context *h){
3072     MpegEncContext * const s = &h->s;
3073     int ref0, ref1;
3074     int cur_poc = s->current_picture_ptr->poc;
3075
3076     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3077        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3078         h->use_weight= 0;
3079         h->use_weight_chroma= 0;
3080         return;
3081     }
3082
3083     h->use_weight= 2;
3084     h->use_weight_chroma= 2;
3085     h->luma_log2_weight_denom= 5;
3086     h->chroma_log2_weight_denom= 5;
3087
3088     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3089         int poc0 = h->ref_list[0][ref0].poc;
3090         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3091             int poc1 = h->ref_list[1][ref1].poc;
3092             int td = av_clip(poc1 - poc0, -128, 127);
3093             if(td){
3094                 int tb = av_clip(cur_poc - poc0, -128, 127);
3095                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3096                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3097                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3098                     h->implicit_weight[ref0][ref1] = 32;
3099                 else
3100                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3101             }else
3102                 h->implicit_weight[ref0][ref1] = 32;
3103         }
3104     }
3105 }
3106
3107 /**
3108  * Mark a picture as no longer needed for reference. The refmask
3109  * argument allows unreferencing of individual fields or the whole frame.
3110  * If the picture becomes entirely unreferenced, but is being held for
3111  * display purposes, it is marked as such.
3112  * @param refmask mask of fields to unreference; the mask is bitwise
3113  *                anded with the reference marking of pic
3114  * @return non-zero if pic becomes entirely unreferenced (except possibly
3115  *         for display purposes) zero if one of the fields remains in
3116  *         reference
3117  */
3118 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3119     int i;
3120     if (pic->reference &= refmask) {
3121         return 0;
3122     } else {
3123         for(i = 0; h->delayed_pic[i]; i++)
3124             if(pic == h->delayed_pic[i]){
3125                 pic->reference=DELAYED_PIC_REF;
3126                 break;
3127             }
3128         return 1;
3129     }
3130 }
3131
3132 /**
3133  * instantaneous decoder refresh.
3134  */
3135 static void idr(H264Context *h){
3136     int i;
3137
3138     for(i=0; i<16; i++){
3139         remove_long(h, i, 0);
3140     }
3141     assert(h->long_ref_count==0);
3142
3143     for(i=0; i<h->short_ref_count; i++){
3144         unreference_pic(h, h->short_ref[i], 0);
3145         h->short_ref[i]= NULL;
3146     }
3147     h->short_ref_count=0;
3148     h->prev_frame_num= 0;
3149     h->prev_frame_num_offset= 0;
3150     h->prev_poc_msb=
3151     h->prev_poc_lsb= 0;
3152 }
3153
3154 /* forget old pics after a seek */
3155 static void flush_dpb(AVCodecContext *avctx){
3156     H264Context *h= avctx->priv_data;
3157     int i;
3158     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3159         if(h->delayed_pic[i])
3160             h->delayed_pic[i]->reference= 0;
3161         h->delayed_pic[i]= NULL;
3162     }
3163     h->outputed_poc= INT_MIN;
3164     idr(h);
3165     if(h->s.current_picture_ptr)
3166         h->s.current_picture_ptr->reference= 0;
3167     h->s.first_field= 0;
3168     ff_mpeg_flush(avctx);
3169 }
3170
3171 /**
3172  * Find a Picture in the short term reference list by frame number.
3173  * @param frame_num frame number to search for
3174  * @param idx the index into h->short_ref where returned picture is found
3175  *            undefined if no picture found.
3176  * @return pointer to the found picture, or NULL if no pic with the provided
3177  *                 frame number is found
3178  */
3179 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3180     MpegEncContext * const s = &h->s;
3181     int i;
3182
3183     for(i=0; i<h->short_ref_count; i++){
3184         Picture *pic= h->short_ref[i];
3185         if(s->avctx->debug&FF_DEBUG_MMCO)
3186             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3187         if(pic->frame_num == frame_num) {
3188             *idx = i;
3189             return pic;
3190         }
3191     }
3192     return NULL;
3193 }
3194
3195 /**
3196  * Remove a picture from the short term reference list by its index in
3197  * that list.  This does no checking on the provided index; it is assumed
3198  * to be valid. Other list entries are shifted down.
3199  * @param i index into h->short_ref of picture to remove.
3200  */
3201 static void remove_short_at_index(H264Context *h, int i){
3202     assert(i >= 0 && i < h->short_ref_count);
3203     h->short_ref[i]= NULL;
3204     if (--h->short_ref_count)
3205         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3206 }
3207
3208 /**
3209  *
3210  * @return the removed picture or NULL if an error occurs
3211  */
3212 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3213     MpegEncContext * const s = &h->s;
3214     Picture *pic;
3215     int i;
3216
3217     if(s->avctx->debug&FF_DEBUG_MMCO)
3218         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3219
3220     pic = find_short(h, frame_num, &i);
3221     if (pic){
3222         if(unreference_pic(h, pic, ref_mask))
3223         remove_short_at_index(h, i);
3224     }
3225
3226     return pic;
3227 }
3228
3229 /**
3230  * Remove a picture from the long term reference list by its index in
3231  * that list.
3232  * @return the removed picture or NULL if an error occurs
3233  */
3234 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3235     Picture *pic;
3236
3237     pic= h->long_ref[i];
3238     if (pic){
3239         if(unreference_pic(h, pic, ref_mask)){
3240             assert(h->long_ref[i]->long_ref == 1);
3241             h->long_ref[i]->long_ref= 0;
3242             h->long_ref[i]= NULL;
3243             h->long_ref_count--;
3244         }
3245     }
3246
3247     return pic;
3248 }
3249
3250 /**
3251  * print short term list
3252  */
3253 static void print_short_term(H264Context *h) {
3254     uint32_t i;
3255     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3256         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3257         for(i=0; i<h->short_ref_count; i++){
3258             Picture *pic= h->short_ref[i];
3259             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3260         }
3261     }
3262 }
3263
3264 /**
3265  * print long term list
3266  */
3267 static void print_long_term(H264Context *h) {
3268     uint32_t i;
3269     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3270         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3271         for(i = 0; i < 16; i++){
3272             Picture *pic= h->long_ref[i];
3273             if (pic) {
3274                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3275             }
3276         }
3277     }
3278 }
3279
3280 /**
3281  * Executes the reference picture marking (memory management control operations).
3282  */
3283 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3284     MpegEncContext * const s = &h->s;
3285     int i, j;
3286     int current_ref_assigned=0;
3287     Picture *pic;
3288
3289     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3290         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3291
3292     for(i=0; i<mmco_count; i++){
3293         int structure, frame_num;
3294         if(s->avctx->debug&FF_DEBUG_MMCO)
3295             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3296
3297         if(   mmco[i].opcode == MMCO_SHORT2UNUSED
3298            || mmco[i].opcode == MMCO_SHORT2LONG){
3299             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3300             pic = find_short(h, frame_num, &j);
3301             if(!pic){
3302                 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3303                    || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3304                 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3305                 continue;
3306             }
3307         }
3308
3309         switch(mmco[i].opcode){
3310         case MMCO_SHORT2UNUSED:
3311             if(s->avctx->debug&FF_DEBUG_MMCO)
3312                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3313             remove_short(h, frame_num, structure ^ PICT_FRAME);
3314             break;
3315         case MMCO_SHORT2LONG:
3316                 if (h->long_ref[mmco[i].long_arg] != pic)
3317                     remove_long(h, mmco[i].long_arg, 0);
3318
3319                 remove_short_at_index(h, j);
3320                 h->long_ref[ mmco[i].long_arg ]= pic;
3321                 if (h->long_ref[ mmco[i].long_arg ]){
3322                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3323                     h->long_ref_count++;
3324                 }
3325             break;
3326         case MMCO_LONG2UNUSED:
3327             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3328             pic = h->long_ref[j];
3329             if (pic) {
3330                 remove_long(h, j, structure ^ PICT_FRAME);
3331             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3332                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3333             break;
3334         case MMCO_LONG:
3335                     // Comment below left from previous code as it is an interresting note.
3336                     /* First field in pair is in short term list or
3337                      * at a different long term index.
3338                      * This is not allowed; see 7.4.3.3, notes 2 and 3.
3339                      * Report the problem and keep the pair where it is,
3340                      * and mark this field valid.
3341                      */
3342
3343             if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3344                 remove_long(h, mmco[i].long_arg, 0);
3345
3346                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3347                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3348                 h->long_ref_count++;
3349             }
3350
3351             s->current_picture_ptr->reference |= s->picture_structure;
3352             current_ref_assigned=1;
3353             break;
3354         case MMCO_SET_MAX_LONG:
3355             assert(mmco[i].long_arg <= 16);
3356             // just remove the long term which index is greater than new max
3357             for(j = mmco[i].long_arg; j<16; j++){
3358                 remove_long(h, j, 0);
3359             }
3360             break;
3361         case MMCO_RESET:
3362             while(h->short_ref_count){
3363                 remove_short(h, h->short_ref[0]->frame_num, 0);
3364             }
3365             for(j = 0; j < 16; j++) {
3366                 remove_long(h, j, 0);
3367             }
3368             s->current_picture_ptr->poc=
3369             s->current_picture_ptr->field_poc[0]=
3370             s->current_picture_ptr->field_poc[1]=
3371             h->poc_lsb=
3372             h->poc_msb=
3373             h->frame_num=
3374             s->current_picture_ptr->frame_num= 0;
3375             break;
3376         default: assert(0);
3377         }
3378     }
3379
3380     if (!current_ref_assigned) {
3381         /* Second field of complementary field pair; the first field of
3382          * which is already referenced. If short referenced, it
3383          * should be first entry in short_ref. If not, it must exist
3384          * in long_ref; trying to put it on the short list here is an
3385          * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
3386          */
3387         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3388             /* Just mark the second field valid */
3389             s->current_picture_ptr->reference = PICT_FRAME;
3390         } else if (s->current_picture_ptr->long_ref) {
3391             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3392                                              "assignment for second field "
3393                                              "in complementary field pair "
3394                                              "(first field is long term)\n");
3395         } else {
3396             pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3397             if(pic){
3398                 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3399             }
3400
3401             if(h->short_ref_count)
3402                 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3403
3404             h->short_ref[0]= s->current_picture_ptr;
3405             h->short_ref_count++;
3406             s->current_picture_ptr->reference |= s->picture_structure;
3407         }
3408     }
3409
3410     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3411
3412         /* We have too many reference frames, probably due to corrupted
3413          * stream. Need to discard one frame. Prevents overrun of the
3414          * short_ref and long_ref buffers.
3415          */
3416         av_log(h->s.avctx, AV_LOG_ERROR,
3417                "number of reference frames exceeds max (probably "
3418                "corrupt input), discarding one\n");
3419
3420         if (h->long_ref_count && !h->short_ref_count) {
3421             for (i = 0; i < 16; ++i)
3422                 if (h->long_ref[i])
3423                     break;
3424
3425             assert(i < 16);
3426             remove_long(h, i, 0);
3427         } else {
3428             pic = h->short_ref[h->short_ref_count - 1];
3429             remove_short(h, pic->frame_num, 0);
3430         }
3431     }
3432
3433     print_short_term(h);
3434     print_long_term(h);
3435     return 0;
3436 }
3437
3438 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3439     MpegEncContext * const s = &h->s;
3440     int i;
3441
3442     h->mmco_index= 0;
3443     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3444         s->broken_link= get_bits1(gb) -1;
3445         if(get_bits1(gb)){
3446             h->mmco[0].opcode= MMCO_LONG;
3447             h->mmco[0].long_arg= 0;
3448             h->mmco_index= 1;
3449         }
3450     }else{
3451         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3452             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3453                 MMCOOpcode opcode= get_ue_golomb(gb);
3454
3455                 h->mmco[i].opcode= opcode;
3456                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3457                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3458 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3459                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3460                         return -1;
3461                     }*/
3462                 }
3463                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3464                     unsigned int long_arg= get_ue_golomb(gb);
3465                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3466                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3467                         return -1;
3468                     }
3469                     h->mmco[i].long_arg= long_arg;
3470                 }
3471
3472                 if(opcode > (unsigned)MMCO_LONG){
3473                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3474                     return -1;
3475                 }
3476                 if(opcode == MMCO_END)
3477                     break;
3478             }
3479             h->mmco_index= i;
3480         }else{
3481             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3482
3483             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3484                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3485                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3486                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3487                 h->mmco_index= 1;
3488                 if (FIELD_PICTURE) {
3489                     h->mmco[0].short_pic_num *= 2;
3490                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3491                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3492                     h->mmco_index= 2;
3493                 }
3494             }
3495         }
3496     }
3497
3498     return 0;
3499 }
3500
3501 static int init_poc(H264Context *h){
3502     MpegEncContext * const s = &h->s;
3503     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3504     int field_poc[2];
3505     Picture *cur = s->current_picture_ptr;
3506
3507     h->frame_num_offset= h->prev_frame_num_offset;
3508     if(h->frame_num < h->prev_frame_num)
3509         h->frame_num_offset += max_frame_num;
3510
3511     if(h->sps.poc_type==0){
3512         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3513
3514         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3515             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3516         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3517             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3518         else
3519             h->poc_msb = h->prev_poc_msb;
3520 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3521         field_poc[0] =
3522         field_poc[1] = h->poc_msb + h->poc_lsb;
3523         if(s->picture_structure == PICT_FRAME)
3524             field_poc[1] += h->delta_poc_bottom;
3525     }else if(h->sps.poc_type==1){
3526         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3527         int i;
3528
3529         if(h->sps.poc_cycle_length != 0)
3530             abs_frame_num = h->frame_num_offset + h->frame_num;
3531         else
3532             abs_frame_num = 0;
3533
3534         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3535             abs_frame_num--;
3536
3537         expected_delta_per_poc_cycle = 0;
3538         for(i=0; i < h->sps.poc_cycle_length; i++)
3539             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3540
3541         if(abs_frame_num > 0){
3542             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3543             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3544
3545             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3546             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3547                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3548         } else
3549             expectedpoc = 0;
3550
3551         if(h->nal_ref_idc == 0)
3552             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3553
3554         field_poc[0] = expectedpoc + h->delta_poc[0];
3555         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3556
3557         if(s->picture_structure == PICT_FRAME)
3558             field_poc[1] += h->delta_poc[1];
3559     }else{
3560         int poc= 2*(h->frame_num_offset + h->frame_num);
3561
3562         if(!h->nal_ref_idc)
3563             poc--;
3564
3565         field_poc[0]= poc;
3566         field_poc[1]= poc;
3567     }
3568
3569     if(s->picture_structure != PICT_BOTTOM_FIELD)
3570         s->current_picture_ptr->field_poc[0]= field_poc[0];
3571     if(s->picture_structure != PICT_TOP_FIELD)
3572         s->current_picture_ptr->field_poc[1]= field_poc[1];
3573     cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3574
3575     return 0;
3576 }
3577
3578
3579 /**
3580  * initialize scan tables
3581  */
3582 static void init_scan_tables(H264Context *h){
3583     MpegEncContext * const s = &h->s;
3584     int i;
3585     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3586         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3587         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3588     }else{
3589         for(i=0; i<16; i++){
3590 #define T(x) (x>>2) | ((x<<2) & 0xF)
3591             h->zigzag_scan[i] = T(zigzag_scan[i]);
3592             h-> field_scan[i] = T( field_scan[i]);
3593 #undef T
3594         }
3595     }
3596     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3597         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
3598         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3599         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3600         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3601     }else{
3602         for(i=0; i<64; i++){
3603 #define T(x) (x>>3) | ((x&7)<<3)
3604             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
3605             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3606             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3607             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3608 #undef T
3609         }
3610     }
3611     if(h->sps.transform_bypass){ //FIXME same ugly
3612         h->zigzag_scan_q0          = zigzag_scan;
3613         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
3614         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3615         h->field_scan_q0           = field_scan;
3616         h->field_scan8x8_q0        = field_scan8x8;
3617         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3618     }else{
3619         h->zigzag_scan_q0          = h->zigzag_scan;
3620         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3621         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3622         h->field_scan_q0           = h->field_scan;
3623         h->field_scan8x8_q0        = h->field_scan8x8;
3624         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3625     }
3626 }
3627
3628 /**
3629  * Replicates H264 "master" context to thread contexts.
3630  */
3631 static void clone_slice(H264Context *dst, H264Context *src)
3632 {
3633     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3634     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3635     dst->s.current_picture      = src->s.current_picture;
3636     dst->s.linesize             = src->s.linesize;
3637     dst->s.uvlinesize           = src->s.uvlinesize;
3638     dst->s.first_field          = src->s.first_field;
3639
3640     dst->prev_poc_msb           = src->prev_poc_msb;
3641     dst->prev_poc_lsb           = src->prev_poc_lsb;
3642     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3643     dst->prev_frame_num         = src->prev_frame_num;
3644     dst->short_ref_count        = src->short_ref_count;
3645
3646     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3647     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3648     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3649     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3650
3651     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3652     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3653 }
3654
3655 /**
3656  * decodes a slice header.
3657  * This will also call MPV_common_init() and frame_start() as needed.
3658  *
3659  * @param h h264context
3660  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3661  *
3662  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3663  */
3664 static int decode_slice_header(H264Context *h, H264Context *h0){
3665     MpegEncContext * const s = &h->s;
3666     MpegEncContext * const s0 = &h0->s;
3667     unsigned int first_mb_in_slice;
3668     unsigned int pps_id;
3669     int num_ref_idx_active_override_flag;
3670     static const uint8_t slice_type_map[5]= {FF_P_TYPE, FF_B_TYPE, FF_I_TYPE, FF_SP_TYPE, FF_SI_TYPE};
3671     unsigned int slice_type, tmp, i, j;
3672     int default_ref_list_done = 0;
3673     int last_pic_structure;
3674
3675     s->dropable= h->nal_ref_idc == 0;
3676
3677     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3678         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3679         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3680     }else{
3681         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3682         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3683     }
3684
3685     first_mb_in_slice= get_ue_golomb(&s->gb);
3686
3687     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3688         h0->current_slice = 0;
3689         if (!s0->first_field)
3690             s->current_picture_ptr= NULL;
3691     }
3692
3693     slice_type= get_ue_golomb(&s->gb);
3694     if(slice_type > 9){
3695         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3696         return -1;
3697     }
3698     if(slice_type > 4){
3699         slice_type -= 5;
3700         h->slice_type_fixed=1;
3701     }else
3702         h->slice_type_fixed=0;
3703
3704     slice_type= slice_type_map[ slice_type ];
3705     if (slice_type == FF_I_TYPE
3706         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3707         default_ref_list_done = 1;
3708     }
3709     h->slice_type= slice_type;
3710     h->slice_type_nos= slice_type & 3;
3711
3712     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3713     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3714         av_log(h->s.avctx, AV_LOG_ERROR,
3715                "B picture before any references, skipping\n");
3716         return -1;
3717     }
3718
3719     pps_id= get_ue_golomb(&s->gb);
3720     if(pps_id>=MAX_PPS_COUNT){
3721         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3722         return -1;
3723     }
3724     if(!h0->pps_buffers[pps_id]) {
3725         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS referenced\n");
3726         return -1;
3727     }
3728     h->pps= *h0->pps_buffers[pps_id];
3729
3730     if(!h0->sps_buffers[h->pps.sps_id]) {
3731         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS referenced\n");
3732         return -1;
3733     }
3734     h->sps = *h0->sps_buffers[h->pps.sps_id];
3735
3736     if(h == h0 && h->dequant_coeff_pps != pps_id){
3737         h->dequant_coeff_pps = pps_id;
3738         init_dequant_tables(h);
3739     }
3740
3741     s->mb_width= h->sps.mb_width;
3742     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3743
3744     h->b_stride=  s->mb_width*4;
3745     h->b8_stride= s->mb_width*2;
3746
3747     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3748     if(h->sps.frame_mbs_only_flag)
3749         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3750     else
3751         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3752
3753     if (s->context_initialized
3754         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3755         if(h != h0)
3756             return -1;   // width / height changed during parallelized decoding
3757         free_tables(h);
3758         MPV_common_end(s);
3759     }
3760     if (!s->context_initialized) {
3761         if(h != h0)
3762             return -1;  // we cant (re-)initialize context during parallel decoding
3763         if (MPV_common_init(s) < 0)
3764             return -1;
3765         s->first_field = 0;
3766
3767         init_scan_tables(h);
3768         alloc_tables(h);
3769
3770         for(i = 1; i < s->avctx->thread_count; i++) {
3771             H264Context *c;
3772             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3773             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3774             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3775             c->sps = h->sps;
3776             c->pps = h->pps;
3777             init_scan_tables(c);
3778             clone_tables(c, h);
3779         }
3780
3781         for(i = 0; i < s->avctx->thread_count; i++)
3782             if(context_init(h->thread_context[i]) < 0)
3783                 return -1;
3784
3785         s->avctx->width = s->width;
3786         s->avctx->height = s->height;
3787         s->avctx->sample_aspect_ratio= h->sps.sar;
3788         if(!s->avctx->sample_aspect_ratio.den)
3789             s->avctx->sample_aspect_ratio.den = 1;
3790
3791         if(h->sps.timing_info_present_flag){
3792             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3793             if(h->x264_build > 0 && h->x264_build < 44)
3794                 s->avctx->time_base.den *= 2;
3795             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3796                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3797         }
3798     }
3799
3800     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3801
3802     h->mb_mbaff = 0;
3803     h->mb_aff_frame = 0;
3804     last_pic_structure = s0->picture_structure;
3805     if(h->sps.frame_mbs_only_flag){
3806         s->picture_structure= PICT_FRAME;
3807     }else{
3808         if(get_bits1(&s->gb)) { //field_pic_flag
3809             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3810         } else {
3811             s->picture_structure= PICT_FRAME;
3812             h->mb_aff_frame = h->sps.mb_aff;
3813         }
3814     }
3815
3816     if(h0->current_slice == 0){
3817         while(h->frame_num !=  h->prev_frame_num &&
3818               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3819             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3820             frame_start(h);
3821             h->prev_frame_num++;
3822             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3823             s->current_picture_ptr->frame_num= h->prev_frame_num;
3824             execute_ref_pic_marking(h, NULL, 0);
3825         }
3826
3827         /* See if we have a decoded first field looking for a pair... */
3828         if (s0->first_field) {
3829             assert(s0->current_picture_ptr);
3830             assert(s0->current_picture_ptr->data[0]);
3831             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3832
3833             /* figure out if we have a complementary field pair */
3834             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3835                 /*
3836                  * Previous field is unmatched. Don't display it, but let it
3837                  * remain for reference if marked as such.
3838                  */
3839                 s0->current_picture_ptr = NULL;
3840                 s0->first_field = FIELD_PICTURE;
3841
3842             } else {
3843                 if (h->nal_ref_idc &&
3844                         s0->current_picture_ptr->reference &&
3845                         s0->current_picture_ptr->frame_num != h->frame_num) {
3846                     /*
3847                      * This and previous field were reference, but had
3848                      * different frame_nums. Consider this field first in
3849                      * pair. Throw away previous field except for reference
3850                      * purposes.
3851                      */
3852                     s0->first_field = 1;
3853                     s0->current_picture_ptr = NULL;
3854
3855                 } else {
3856                     /* Second field in complementary pair */
3857                     s0->first_field = 0;
3858                 }
3859             }
3860
3861         } else {
3862             /* Frame or first field in a potentially complementary pair */
3863             assert(!s0->current_picture_ptr);
3864             s0->first_field = FIELD_PICTURE;
3865         }
3866
3867         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3868             s0->first_field = 0;
3869             return -1;
3870         }
3871     }
3872     if(h != h0)
3873         clone_slice(h, h0);
3874
3875     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3876
3877     assert(s->mb_num == s->mb_width * s->mb_height);
3878     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3879        first_mb_in_slice                    >= s->mb_num){
3880         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3881         return -1;
3882     }
3883     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3884     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3885     if (s->picture_structure == PICT_BOTTOM_FIELD)
3886         s->resync_mb_y = s->mb_y = s->mb_y + 1;
3887     assert(s->mb_y < s->mb_height);
3888
3889     if(s->picture_structure==PICT_FRAME){
3890         h->curr_pic_num=   h->frame_num;
3891         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3892     }else{
3893         h->curr_pic_num= 2*h->frame_num + 1;
3894         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3895     }
3896
3897     if(h->nal_unit_type == NAL_IDR_SLICE){
3898         get_ue_golomb(&s->gb); /* idr_pic_id */
3899     }
3900
3901     if(h->sps.poc_type==0){
3902         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3903
3904         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3905             h->delta_poc_bottom= get_se_golomb(&s->gb);
3906         }
3907     }
3908
3909     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3910         h->delta_poc[0]= get_se_golomb(&s->gb);
3911
3912         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3913             h->delta_poc[1]= get_se_golomb(&s->gb);
3914     }
3915
3916     init_poc(h);
3917
3918     if(h->pps.redundant_pic_cnt_present){
3919         h->redundant_pic_count= get_ue_golomb(&s->gb);
3920     }
3921
3922     //set defaults, might be overridden a few lines later
3923     h->ref_count[0]= h->pps.ref_count[0];
3924     h->ref_count[1]= h->pps.ref_count[1];
3925
3926     if(h->slice_type_nos != FF_I_TYPE){
3927         if(h->slice_type_nos == FF_B_TYPE){
3928             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3929         }
3930         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3931
3932         if(num_ref_idx_active_override_flag){
3933             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3934             if(h->slice_type_nos==FF_B_TYPE)
3935                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3936
3937             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3938                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3939                 h->ref_count[0]= h->ref_count[1]= 1;
3940                 return -1;
3941             }
3942         }
3943         if(h->slice_type_nos == FF_B_TYPE)
3944             h->list_count= 2;
3945         else
3946             h->list_count= 1;
3947     }else
3948         h->list_count= 0;
3949
3950     if(!default_ref_list_done){
3951         fill_default_ref_list(h);
3952     }
3953
3954     if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
3955         return -1;
3956
3957     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
3958        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
3959         pred_weight_table(h);
3960     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
3961         implicit_weight_table(h);
3962     else
3963         h->use_weight = 0;
3964
3965     if(h->nal_ref_idc)
3966         decode_ref_pic_marking(h0, &s->gb);
3967
3968     if(FRAME_MBAFF)
3969         fill_mbaff_ref_list(h);
3970
3971     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
3972         tmp = get_ue_golomb(&s->gb);
3973         if(tmp > 2){
3974             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
3975             return -1;
3976         }
3977         h->cabac_init_idc= tmp;
3978     }
3979
3980     h->last_qscale_diff = 0;
3981     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
3982     if(tmp>51){
3983         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
3984         return -1;
3985     }
3986     s->qscale= tmp;
3987     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
3988     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
3989     //FIXME qscale / qp ... stuff
3990     if(h->slice_type == FF_SP_TYPE){
3991         get_bits1(&s->gb); /* sp_for_switch_flag */
3992     }
3993     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
3994         get_se_golomb(&s->gb); /* slice_qs_delta */
3995     }
3996
3997     h->deblocking_filter = 1;
3998     h->slice_alpha_c0_offset = 0;
3999     h->slice_beta_offset = 0;
4000     if( h->pps.deblocking_filter_parameters_present ) {
4001         tmp= get_ue_golomb(&s->gb);
4002         if(tmp > 2){
4003             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4004             return -1;
4005         }
4006         h->deblocking_filter= tmp;
4007         if(h->deblocking_filter < 2)
4008             h->deblocking_filter^= 1; // 1<->0
4009
4010         if( h->deblocking_filter ) {
4011             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4012             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4013         }
4014     }
4015
4016     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4017        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
4018        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
4019        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4020         h->deblocking_filter= 0;
4021
4022     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4023         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4024             /* Cheat slightly for speed:
4025                Do not bother to deblock across slices. */
4026             h->deblocking_filter = 2;
4027         } else {
4028             h0->max_contexts = 1;
4029             if(!h0->single_decode_warning) {
4030                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4031                 h0->single_decode_warning = 1;
4032             }
4033             if(h != h0)
4034                 return 1; // deblocking switched inside frame
4035         }
4036     }
4037
4038 #if 0 //FMO
4039     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4040         slice_group_change_cycle= get_bits(&s->gb, ?);
4041 #endif
4042
4043     h0->last_slice_type = slice_type;
4044     h->slice_num = ++h0->current_slice;
4045
4046     for(j=0; j<2; j++){
4047         int *ref2frm= h->ref2frm[h->slice_num&15][j];
4048         ref2frm[0]=
4049         ref2frm[1]= -1;
4050         for(i=0; i<48; i++)
4051             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
4052                           +(h->ref_list[j][i].reference&3);
4053     }
4054
4055     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4056     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4057
4058     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4059         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4060                h->slice_num,
4061                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4062                first_mb_in_slice,
4063                av_get_pict_type_char(h->slice_type),
4064                pps_id, h->frame_num,
4065                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4066                h->ref_count[0], h->ref_count[1],
4067                s->qscale,
4068                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4069                h->use_weight,
4070                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4071                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4072                );
4073     }
4074
4075     return 0;
4076 }
4077
4078 /**
4079  *
4080  */
4081 static inline int get_level_prefix(GetBitContext *gb){
4082     unsigned int buf;
4083     int log;
4084
4085     OPEN_READER(re, gb);
4086     UPDATE_CACHE(re, gb);
4087     buf=GET_CACHE(re, gb);
4088
4089     log= 32 - av_log2(buf);
4090 #ifdef TRACE
4091     print_bin(buf>>(32-log), log);
4092     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4093 #endif
4094
4095     LAST_SKIP_BITS(re, gb, log);
4096     CLOSE_READER(re, gb);
4097
4098     return log-1;
4099 }
4100
4101 static inline int get_dct8x8_allowed(H264Context *h){
4102     int i;
4103     for(i=0; i<4; i++){
4104         if(!IS_SUB_8X8(h->sub_mb_type[i])
4105            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4106             return 0;
4107     }
4108     return 1;
4109 }
4110
4111 /**
4112  * decodes a residual block.
4113  * @param n block index
4114  * @param scantable scantable
4115  * @param max_coeff number of coefficients in the block
4116  * @return <0 if an error occurred
4117  */
4118 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4119     MpegEncContext * const s = &h->s;
4120     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4121     int level[16];
4122     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4123
4124     //FIXME put trailing_onex into the context
4125
4126     if(n == CHROMA_DC_BLOCK_INDEX){
4127         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4128         total_coeff= coeff_token>>2;
4129     }else{
4130         if(n == LUMA_DC_BLOCK_INDEX){
4131             total_coeff= pred_non_zero_count(h, 0);
4132             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4133             total_coeff= coeff_token>>2;
4134         }else{
4135             total_coeff= pred_non_zero_count(h, n);
4136             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4137             total_coeff= coeff_token>>2;
4138             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4139         }
4140     }
4141
4142     //FIXME set last_non_zero?
4143
4144     if(total_coeff==0)
4145         return 0;
4146     if(total_coeff > (unsigned)max_coeff) {
4147         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4148         return -1;
4149     }
4150
4151     trailing_ones= coeff_token&3;
4152     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4153     assert(total_coeff<=16);
4154
4155     for(i=0; i<trailing_ones; i++){
4156         level[i]= 1 - 2*get_bits1(gb);
4157     }
4158
4159     if(i<total_coeff) {
4160         int level_code, mask;
4161         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4162         int prefix= get_level_prefix(gb);
4163
4164         //first coefficient has suffix_length equal to 0 or 1
4165         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4166             if(suffix_length)
4167                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4168             else
4169                 level_code= (prefix<<suffix_length); //part
4170         }else if(prefix==14){
4171             if(suffix_length)
4172                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4173             else
4174                 level_code= prefix + get_bits(gb, 4); //part
4175         }else{
4176             level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
4177             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4178             if(prefix>=16)
4179                 level_code += (1<<(prefix-3))-4096;
4180         }
4181
4182         if(trailing_ones < 3) level_code += 2;
4183
4184         suffix_length = 1;
4185         if(level_code > 5)
4186             suffix_length++;
4187         mask= -(level_code&1);
4188         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4189         i++;
4190
4191         //remaining coefficients have suffix_length > 0
4192         for(;i<total_coeff;i++) {
4193             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4194             prefix = get_level_prefix(gb);
4195             if(prefix<15){
4196                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4197             }else{
4198                 level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4199                 if(prefix>=16)
4200                     level_code += (1<<(prefix-3))-4096;
4201             }
4202             mask= -(level_code&1);
4203             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4204             if(level_code > suffix_limit[suffix_length])
4205                 suffix_length++;
4206         }
4207     }
4208
4209     if(total_coeff == max_coeff)
4210         zeros_left=0;
4211     else{
4212         if(n == CHROMA_DC_BLOCK_INDEX)
4213             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4214         else
4215             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4216     }
4217
4218     coeff_num = zeros_left + total_coeff - 1;
4219     j = scantable[coeff_num];
4220     if(n > 24){
4221         block[j] = level[0];
4222         for(i=1;i<total_coeff;i++) {
4223             if(zeros_left <= 0)
4224                 run_before = 0;
4225             else if(zeros_left < 7){
4226                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4227             }else{
4228                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4229             }
4230             zeros_left -= run_before;
4231             coeff_num -= 1 + run_before;
4232             j= scantable[ coeff_num ];
4233
4234             block[j]= level[i];
4235         }
4236     }else{
4237         block[j] = (level[0] * qmul[j] + 32)>>6;
4238         for(i=1;i<total_coeff;i++) {
4239             if(zeros_left <= 0)
4240                 run_before = 0;
4241             else if(zeros_left < 7){
4242                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4243             }else{
4244                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4245             }
4246             zeros_left -= run_before;
4247             coeff_num -= 1 + run_before;
4248             j= scantable[ coeff_num ];
4249
4250             block[j]= (level[i] * qmul[j] + 32)>>6;
4251         }
4252     }
4253
4254     if(zeros_left<0){
4255         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4256         return -1;
4257     }
4258
4259     return 0;
4260 }
4261
4262 static void predict_field_decoding_flag(H264Context *h){
4263     MpegEncContext * const s = &h->s;
4264     const int mb_xy= h->mb_xy;
4265     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4266                 ? s->current_picture.mb_type[mb_xy-1]
4267                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4268                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4269                 : 0;
4270     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4271 }
4272
4273 /**
4274  * decodes a P_SKIP or B_SKIP macroblock
4275  */
4276 static void decode_mb_skip(H264Context *h){
4277     MpegEncContext * const s = &h->s;
4278     const int mb_xy= h->mb_xy;
4279     int mb_type=0;
4280
4281     memset(h->non_zero_count[mb_xy], 0, 16);
4282     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4283
4284     if(MB_FIELD)
4285         mb_type|= MB_TYPE_INTERLACED;
4286
4287     if( h->slice_type_nos == FF_B_TYPE )
4288     {
4289         // just for fill_caches. pred_direct_motion will set the real mb_type
4290         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4291
4292         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4293         pred_direct_motion(h, &mb_type);
4294         mb_type|= MB_TYPE_SKIP;
4295     }
4296     else
4297     {
4298         int mx, my;
4299         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4300
4301         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4302         pred_pskip_motion(h, &mx, &my);
4303         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4304         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4305     }
4306
4307     write_back_motion(h, mb_type);
4308     s->current_picture.mb_type[mb_xy]= mb_type;
4309     s->current_picture.qscale_table[mb_xy]= s->qscale;
4310     h->slice_table[ mb_xy ]= h->slice_num;
4311     h->prev_mb_skipped= 1;
4312 }
4313
4314 /**
4315  * decodes a macroblock
4316  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4317  */
4318 static int decode_mb_cavlc(H264Context *h){
4319     MpegEncContext * const s = &h->s;
4320     int mb_xy;
4321     int partition_count;
4322     unsigned int mb_type, cbp;
4323     int dct8x8_allowed= h->pps.transform_8x8_mode;
4324
4325     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4326
4327     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
4328
4329     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4330     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4331                 down the code */
4332     if(h->slice_type_nos != FF_I_TYPE){
4333         if(s->mb_skip_run==-1)
4334             s->mb_skip_run= get_ue_golomb(&s->gb);
4335
4336         if (s->mb_skip_run--) {
4337             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4338                 if(s->mb_skip_run==0)
4339                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4340                 else
4341                     predict_field_decoding_flag(h);
4342             }
4343             decode_mb_skip(h);
4344             return 0;
4345         }
4346     }
4347     if(FRAME_MBAFF){
4348         if( (s->mb_y&1) == 0 )
4349             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4350     }else
4351         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
4352
4353     h->prev_mb_skipped= 0;
4354
4355     mb_type= get_ue_golomb(&s->gb);
4356     if(h->slice_type_nos == FF_B_TYPE){
4357         if(mb_type < 23){
4358             partition_count= b_mb_type_info[mb_type].partition_count;
4359             mb_type=         b_mb_type_info[mb_type].type;
4360         }else{
4361             mb_type -= 23;
4362             goto decode_intra_mb;
4363         }
4364     }else if(h->slice_type_nos == FF_P_TYPE){
4365         if(mb_type < 5){
4366             partition_count= p_mb_type_info[mb_type].partition_count;
4367             mb_type=         p_mb_type_info[mb_type].type;
4368         }else{
4369             mb_type -= 5;
4370             goto decode_intra_mb;
4371         }
4372     }else{
4373        assert(h->slice_type_nos == FF_I_TYPE);
4374         if(h->slice_type == FF_SI_TYPE && mb_type)
4375             mb_type--;
4376 decode_intra_mb:
4377         if(mb_type > 25){
4378             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4379             return -1;
4380         }
4381         partition_count=0;
4382         cbp= i_mb_type_info[mb_type].cbp;
4383         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4384         mb_type= i_mb_type_info[mb_type].type;
4385     }
4386
4387     if(MB_FIELD)
4388         mb_type |= MB_TYPE_INTERLACED;
4389
4390     h->slice_table[ mb_xy ]= h->slice_num;
4391
4392     if(IS_INTRA_PCM(mb_type)){
4393         unsigned int x;
4394
4395         // We assume these blocks are very rare so we do not optimize it.
4396         align_get_bits(&s->gb);
4397
4398         // The pixels are stored in the same order as levels in h->mb array.
4399         for(x=0; x < (CHROMA ? 384 : 256); x++){
4400             ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4401         }
4402
4403         // In deblocking, the quantizer is 0
4404         s->current_picture.qscale_table[mb_xy]= 0;
4405         // All coeffs are present
4406         memset(h->non_zero_count[mb_xy], 16, 16);
4407
4408         s->current_picture.mb_type[mb_xy]= mb_type;
4409         return 0;
4410     }
4411
4412     if(MB_MBAFF){
4413         h->ref_count[0] <<= 1;
4414         h->ref_count[1] <<= 1;
4415     }
4416
4417     fill_caches(h, mb_type, 0);
4418
4419     //mb_pred
4420     if(IS_INTRA(mb_type)){
4421         int pred_mode;
4422 //            init_top_left_availability(h);
4423         if(IS_INTRA4x4(mb_type)){
4424             int i;
4425             int di = 1;
4426             if(dct8x8_allowed && get_bits1(&s->gb)){
4427                 mb_type |= MB_TYPE_8x8DCT;
4428                 di = 4;
4429             }
4430
4431 //                fill_intra4x4_pred_table(h);
4432             for(i=0; i<16; i+=di){
4433                 int mode= pred_intra_mode(h, i);
4434
4435                 if(!get_bits1(&s->gb)){
4436                     const int rem_mode= get_bits(&s->gb, 3);
4437                     mode = rem_mode + (rem_mode >= mode);
4438                 }
4439
4440                 if(di==4)
4441                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4442                 else
4443                     h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4444             }
4445             write_back_intra_pred_mode(h);
4446             if( check_intra4x4_pred_mode(h) < 0)
4447                 return -1;
4448         }else{
4449             h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4450             if(h->intra16x16_pred_mode < 0)
4451                 return -1;
4452         }
4453         if(CHROMA){
4454             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4455             if(pred_mode < 0)
4456                 return -1;
4457             h->chroma_pred_mode= pred_mode;
4458         }
4459     }else if(partition_count==4){
4460         int i, j, sub_partition_count[4], list, ref[2][4];
4461
4462         if(h->slice_type_nos == FF_B_TYPE){
4463             for(i=0; i<4; i++){
4464                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4465                 if(h->sub_mb_type[i] >=13){
4466                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4467                     return -1;
4468                 }
4469                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4470                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4471             }
4472             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4473                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4474                 pred_direct_motion(h, &mb_type);
4475                 h->ref_cache[0][scan8[4]] =
4476                 h->ref_cache[1][scan8[4]] =
4477                 h->ref_cache[0][scan8[12]] =
4478                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4479             }
4480         }else{
4481             assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4482             for(i=0; i<4; i++){
4483                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4484                 if(h->sub_mb_type[i] >=4){
4485                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4486                     return -1;
4487                 }
4488                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4489                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4490             }
4491         }
4492
4493         for(list=0; list<h->list_count; list++){
4494             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4495             for(i=0; i<4; i++){
4496                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4497                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4498                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4499                     if(tmp>=ref_count){
4500                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4501                         return -1;
4502                     }
4503                     ref[list][i]= tmp;
4504                 }else{
4505                  //FIXME
4506                     ref[list][i] = -1;
4507                 }
4508             }
4509         }
4510
4511         if(dct8x8_allowed)
4512             dct8x8_allowed = get_dct8x8_allowed(h);
4513
4514         for(list=0; list<h->list_count; list++){
4515             for(i=0; i<4; i++){
4516                 if(IS_DIRECT(h->sub_mb_type[i])) {
4517                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4518                     continue;
4519                 }
4520                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4521                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4522
4523                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4524                     const int sub_mb_type= h->sub_mb_type[i];
4525                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4526                     for(j=0; j<sub_partition_count[i]; j++){
4527                         int mx, my;
4528                         const int index= 4*i + block_width*j;
4529                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4530                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4531                         mx += get_se_golomb(&s->gb);
4532                         my += get_se_golomb(&s->gb);
4533                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4534
4535                         if(IS_SUB_8X8(sub_mb_type)){
4536                             mv_cache[ 1 ][0]=
4537                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4538                             mv_cache[ 1 ][1]=
4539                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4540                         }else if(IS_SUB_8X4(sub_mb_type)){
4541                             mv_cache[ 1 ][0]= mx;
4542                             mv_cache[ 1 ][1]= my;
4543                         }else if(IS_SUB_4X8(sub_mb_type)){
4544                             mv_cache[ 8 ][0]= mx;
4545                             mv_cache[ 8 ][1]= my;
4546                         }
4547                         mv_cache[ 0 ][0]= mx;
4548                         mv_cache[ 0 ][1]= my;
4549                     }
4550                 }else{
4551                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4552                     p[0] = p[1]=
4553                     p[8] = p[9]= 0;
4554                 }
4555             }
4556         }
4557     }else if(IS_DIRECT(mb_type)){
4558         pred_direct_motion(h, &mb_type);
4559         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4560     }else{
4561         int list, mx, my, i;
4562          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4563         if(IS_16X16(mb_type)){
4564             for(list=0; list<h->list_count; list++){
4565                     unsigned int val;
4566                     if(IS_DIR(mb_type, 0, list)){
4567                         val= get_te0_golomb(&s->gb, h->ref_count[list]);
4568                         if(val >= h->ref_count[list]){
4569                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4570                             return -1;
4571                         }
4572                     }else
4573                         val= LIST_NOT_USED&0xFF;
4574                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4575             }
4576             for(list=0; list<h->list_count; list++){
4577                 unsigned int val;
4578                 if(IS_DIR(mb_type, 0, list)){
4579                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4580                     mx += get_se_golomb(&s->gb);
4581                     my += get_se_golomb(&s->gb);
4582                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4583
4584                     val= pack16to32(mx,my);
4585                 }else
4586                     val=0;
4587                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4588             }
4589         }
4590         else if(IS_16X8(mb_type)){
4591             for(list=0; list<h->list_count; list++){
4592                     for(i=0; i<2; i++){
4593                         unsigned int val;
4594                         if(IS_DIR(mb_type, i, list)){
4595                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4596                             if(val >= h->ref_count[list]){
4597                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4598                                 return -1;
4599                             }
4600                         }else
4601                             val= LIST_NOT_USED&0xFF;
4602                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4603                     }
4604             }
4605             for(list=0; list<h->list_count; list++){
4606                 for(i=0; i<2; i++){
4607                     unsigned int val;
4608                     if(IS_DIR(mb_type, i, list)){
4609                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4610                         mx += get_se_golomb(&s->gb);
4611                         my += get_se_golomb(&s->gb);
4612                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4613
4614                         val= pack16to32(mx,my);
4615                     }else
4616                         val=0;
4617                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4618                 }
4619             }
4620         }else{
4621             assert(IS_8X16(mb_type));
4622             for(list=0; list<h->list_count; list++){
4623                     for(i=0; i<2; i++){
4624                         unsigned int val;
4625                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4626                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4627                             if(val >= h->ref_count[list]){
4628                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4629                                 return -1;
4630                             }
4631                         }else
4632                             val= LIST_NOT_USED&0xFF;
4633                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4634                     }
4635             }
4636             for(list=0; list<h->list_count; list++){
4637                 for(i=0; i<2; i++){
4638                     unsigned int val;
4639                     if(IS_DIR(mb_type, i, list)){
4640                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4641                         mx += get_se_golomb(&s->gb);
4642                         my += get_se_golomb(&s->gb);
4643                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4644
4645                         val= pack16to32(mx,my);
4646                     }else
4647                         val=0;
4648                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4649                 }
4650             }
4651         }
4652     }
4653
4654     if(IS_INTER(mb_type))
4655         write_back_motion(h, mb_type);
4656
4657     if(!IS_INTRA16x16(mb_type)){
4658         cbp= get_ue_golomb(&s->gb);
4659         if(cbp > 47){
4660             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4661             return -1;
4662         }
4663
4664         if(CHROMA){
4665             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4666             else                     cbp= golomb_to_inter_cbp   [cbp];
4667         }else{
4668             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4669             else                     cbp= golomb_to_inter_cbp_gray[cbp];
4670         }
4671     }
4672     h->cbp = cbp;
4673
4674     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4675         if(get_bits1(&s->gb)){
4676             mb_type |= MB_TYPE_8x8DCT;
4677             h->cbp_table[mb_xy]= cbp;
4678         }
4679     }
4680     s->current_picture.mb_type[mb_xy]= mb_type;
4681
4682     if(cbp || IS_INTRA16x16(mb_type)){
4683         int i8x8, i4x4, chroma_idx;
4684         int dquant;
4685         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4686         const uint8_t *scan, *scan8x8, *dc_scan;
4687
4688 //        fill_non_zero_count_cache(h);
4689
4690         if(IS_INTERLACED(mb_type)){
4691             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4692             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4693             dc_scan= luma_dc_field_scan;
4694         }else{
4695             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4696             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4697             dc_scan= luma_dc_zigzag_scan;
4698         }
4699
4700         dquant= get_se_golomb(&s->gb);
4701
4702         if( dquant > 25 || dquant < -26 ){
4703             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4704             return -1;
4705         }
4706
4707         s->qscale += dquant;
4708         if(((unsigned)s->qscale) > 51){
4709             if(s->qscale<0) s->qscale+= 52;
4710             else            s->qscale-= 52;
4711         }
4712
4713         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4714         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4715         if(IS_INTRA16x16(mb_type)){
4716             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4717                 return -1; //FIXME continue if partitioned and other return -1 too
4718             }
4719
4720             assert((cbp&15) == 0 || (cbp&15) == 15);
4721
4722             if(cbp&15){
4723                 for(i8x8=0; i8x8<4; i8x8++){
4724                     for(i4x4=0; i4x4<4; i4x4++){
4725                         const int index= i4x4 + 4*i8x8;
4726                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4727                             return -1;
4728                         }
4729                     }
4730                 }
4731             }else{
4732                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4733             }
4734         }else{
4735             for(i8x8=0; i8x8<4; i8x8++){
4736                 if(cbp & (1<<i8x8)){
4737                     if(IS_8x8DCT(mb_type)){
4738                         DCTELEM *buf = &h->mb[64*i8x8];
4739                         uint8_t *nnz;
4740                         for(i4x4=0; i4x4<4; i4x4++){
4741                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4742                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4743                                 return -1;
4744                         }
4745                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4746                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4747                     }else{
4748                         for(i4x4=0; i4x4<4; i4x4++){
4749                             const int index= i4x4 + 4*i8x8;
4750
4751                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4752                                 return -1;
4753                             }
4754                         }
4755                     }
4756                 }else{
4757                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4758                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4759                 }
4760             }
4761         }
4762
4763         if(cbp&0x30){
4764             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4765                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4766                     return -1;
4767                 }
4768         }
4769
4770         if(cbp&0x20){
4771             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4772                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4773                 for(i4x4=0; i4x4<4; i4x4++){
4774                     const int index= 16 + 4*chroma_idx + i4x4;
4775                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4776                         return -1;
4777                     }
4778                 }
4779             }
4780         }else{
4781             uint8_t * const nnz= &h->non_zero_count_cache[0];
4782             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4783             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4784         }
4785     }else{
4786         uint8_t * const nnz= &h->non_zero_count_cache[0];
4787         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4788         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4789         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4790     }
4791     s->current_picture.qscale_table[mb_xy]= s->qscale;
4792     write_back_non_zero_count(h);
4793
4794     if(MB_MBAFF){
4795         h->ref_count[0] >>= 1;
4796         h->ref_count[1] >>= 1;
4797     }
4798
4799     return 0;
4800 }
4801
4802 static int decode_cabac_field_decoding_flag(H264Context *h) {
4803     MpegEncContext * const s = &h->s;
4804     const int mb_x = s->mb_x;
4805     const int mb_y = s->mb_y & ~1;
4806     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4807     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4808
4809     unsigned int ctx = 0;
4810
4811     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4812         ctx += 1;
4813     }
4814     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4815         ctx += 1;
4816     }
4817
4818     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4819 }
4820
4821 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4822     uint8_t *state= &h->cabac_state[ctx_base];
4823     int mb_type;
4824
4825     if(intra_slice){
4826         MpegEncContext * const s = &h->s;
4827         const int mba_xy = h->left_mb_xy[0];
4828         const int mbb_xy = h->top_mb_xy;
4829         int ctx=0;
4830         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4831             ctx++;
4832         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4833             ctx++;
4834         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4835             return 0;   /* I4x4 */
4836         state += 2;
4837     }else{
4838         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4839             return 0;   /* I4x4 */
4840     }
4841
4842     if( get_cabac_terminate( &h->cabac ) )
4843         return 25;  /* PCM */
4844
4845     mb_type = 1; /* I16x16 */
4846     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4847     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4848         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4849     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4850     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4851     return mb_type;
4852 }
4853
4854 static int decode_cabac_mb_type( H264Context *h ) {
4855     MpegEncContext * const s = &h->s;
4856
4857     if( h->slice_type_nos == FF_I_TYPE ) {
4858         return decode_cabac_intra_mb_type(h, 3, 1);
4859     } else if( h->slice_type_nos == FF_P_TYPE ) {
4860         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
4861             /* P-type */
4862             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
4863                 /* P_L0_D16x16, P_8x8 */
4864                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
4865             } else {
4866                 /* P_L0_D8x16, P_L0_D16x8 */
4867                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
4868             }
4869         } else {
4870             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
4871         }
4872     } else if( h->slice_type_nos == FF_B_TYPE ) {
4873         const int mba_xy = h->left_mb_xy[0];
4874         const int mbb_xy = h->top_mb_xy;
4875         int ctx = 0;
4876         int bits;
4877
4878         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4879             ctx++;
4880         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4881             ctx++;
4882
4883         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4884             return 0; /* B_Direct_16x16 */
4885
4886         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4887             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4888         }
4889
4890         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4891         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4892         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4893         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4894         if( bits < 8 )
4895             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4896         else if( bits == 13 ) {
4897             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4898         } else if( bits == 14 )
4899             return 11; /* B_L1_L0_8x16 */
4900         else if( bits == 15 )
4901             return 22; /* B_8x8 */
4902
4903         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4904         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4905     } else {
4906         /* TODO SI/SP frames? */
4907         return -1;
4908     }
4909 }
4910
4911 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
4912     MpegEncContext * const s = &h->s;
4913     int mba_xy, mbb_xy;
4914     int ctx = 0;
4915
4916     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
4917         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
4918         mba_xy = mb_xy - 1;
4919         if( (mb_y&1)
4920             && h->slice_table[mba_xy] == h->slice_num
4921             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
4922             mba_xy += s->mb_stride;
4923         if( MB_FIELD ){
4924             mbb_xy = mb_xy - s->mb_stride;
4925             if( !(mb_y&1)
4926                 && h->slice_table[mbb_xy] == h->slice_num
4927                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
4928                 mbb_xy -= s->mb_stride;
4929         }else
4930             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
4931     }else{
4932         int mb_xy = h->mb_xy;
4933         mba_xy = mb_xy - 1;
4934         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
4935     }
4936
4937     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4938         ctx++;
4939     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
4940         ctx++;
4941
4942     if( h->slice_type_nos == FF_B_TYPE )
4943         ctx += 13;
4944     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
4945 }
4946
4947 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4948     int mode = 0;
4949
4950     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
4951         return pred_mode;
4952
4953     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
4954     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
4955     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
4956
4957     if( mode >= pred_mode )
4958         return mode + 1;
4959     else
4960         return mode;
4961 }
4962
4963 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
4964     const int mba_xy = h->left_mb_xy[0];
4965     const int mbb_xy = h->top_mb_xy;
4966
4967     int ctx = 0;
4968
4969     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
4970     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
4971         ctx++;
4972
4973     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
4974         ctx++;
4975
4976     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
4977         return 0;
4978
4979     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4980         return 1;
4981     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4982         return 2;
4983     else
4984         return 3;
4985 }
4986
4987 static int decode_cabac_mb_cbp_luma( H264Context *h) {
4988     int cbp_b, cbp_a, ctx, cbp = 0;
4989
4990     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
4991     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
4992
4993     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
4994     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
4995     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
4996     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
4997     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
4998     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
4999     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
5000     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5001     return cbp;
5002 }
5003 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5004     int ctx;
5005     int cbp_a, cbp_b;
5006
5007     cbp_a = (h->left_cbp>>4)&0x03;
5008     cbp_b = (h-> top_cbp>>4)&0x03;
5009
5010     ctx = 0;
5011     if( cbp_a > 0 ) ctx++;
5012     if( cbp_b > 0 ) ctx += 2;
5013     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5014         return 0;
5015
5016     ctx = 4;
5017     if( cbp_a == 2 ) ctx++;
5018     if( cbp_b == 2 ) ctx += 2;
5019     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5020 }
5021 static int decode_cabac_mb_dqp( H264Context *h) {
5022     int   ctx = 0;
5023     int   val = 0;
5024
5025     if( h->last_qscale_diff != 0 )
5026         ctx++;
5027
5028     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5029         if( ctx < 2 )
5030             ctx = 2;
5031         else
5032             ctx = 3;
5033         val++;
5034         if(val > 102) //prevent infinite loop
5035             return INT_MIN;
5036     }
5037
5038     if( val&0x01 )
5039         return (val + 1)/2;
5040     else
5041         return -(val + 1)/2;
5042 }
5043 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5044     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5045         return 0;   /* 8x8 */
5046     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5047         return 1;   /* 8x4 */
5048     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5049         return 2;   /* 4x8 */
5050     return 3;       /* 4x4 */
5051 }
5052 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5053     int type;
5054     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5055         return 0;   /* B_Direct_8x8 */
5056     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5057         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5058     type = 3;
5059     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5060         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5061             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5062         type += 4;
5063     }
5064     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5065     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5066     return type;
5067 }
5068
5069 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5070     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5071 }
5072
5073 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5074     int refa = h->ref_cache[list][scan8[n] - 1];
5075     int refb = h->ref_cache[list][scan8[n] - 8];
5076     int ref  = 0;
5077     int ctx  = 0;
5078
5079     if( h->slice_type_nos == FF_B_TYPE) {
5080         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5081             ctx++;
5082         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5083             ctx += 2;
5084     } else {
5085         if( refa > 0 )
5086             ctx++;
5087         if( refb > 0 )
5088             ctx += 2;
5089     }
5090
5091     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5092         ref++;
5093         if( ctx < 4 )
5094             ctx = 4;
5095         else
5096             ctx = 5;
5097         if(ref >= 32 /*h->ref_list[list]*/){
5098             av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
5099             return 0; //FIXME we should return -1 and check the return everywhere
5100         }
5101     }
5102     return ref;
5103 }
5104
5105 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5106     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5107                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5108     int ctxbase = (l == 0) ? 40 : 47;
5109     int ctx, mvd;
5110
5111     if( amvd < 3 )
5112         ctx = 0;
5113     else if( amvd > 32 )
5114         ctx = 2;
5115     else
5116         ctx = 1;
5117
5118     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5119         return 0;
5120
5121     mvd= 1;
5122     ctx= 3;
5123     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5124         mvd++;
5125         if( ctx < 6 )
5126             ctx++;
5127     }
5128
5129     if( mvd >= 9 ) {
5130         int k = 3;
5131         while( get_cabac_bypass( &h->cabac ) ) {
5132             mvd += 1 << k;
5133             k++;
5134             if(k>24){
5135                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5136                 return INT_MIN;
5137             }
5138         }
5139         while( k-- ) {
5140             if( get_cabac_bypass( &h->cabac ) )
5141                 mvd += 1 << k;
5142         }
5143     }
5144     return get_cabac_bypass_sign( &h->cabac, -mvd );
5145 }
5146
5147 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5148     int nza, nzb;
5149     int ctx = 0;
5150
5151     if( is_dc ) {
5152         if( cat == 0 ) {
5153             nza = h->left_cbp&0x100;
5154             nzb = h-> top_cbp&0x100;
5155         } else {
5156             nza = (h->left_cbp>>(6+idx))&0x01;
5157             nzb = (h-> top_cbp>>(6+idx))&0x01;
5158         }
5159     } else {
5160         if( cat == 4 ) {
5161             nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5162             nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5163         } else {
5164             assert(cat == 1 || cat == 2);
5165             nza = h->non_zero_count_cache[scan8[idx] - 1];
5166             nzb = h->non_zero_count_cache[scan8[idx] - 8];
5167         }
5168     }
5169
5170     if( nza > 0 )
5171         ctx++;
5172
5173     if( nzb > 0 )
5174         ctx += 2;
5175
5176     return ctx + 4 * cat;
5177 }
5178
5179 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5180     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5181     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5182     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5183     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5184 };
5185
5186 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5187     static const int significant_coeff_flag_offset[2][6] = {
5188       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5189       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5190     };
5191     static const int last_coeff_flag_offset[2][6] = {
5192       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5193       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5194     };
5195     static const int coeff_abs_level_m1_offset[6] = {
5196         227+0, 227+10, 227+20, 227+30, 227+39, 426
5197     };
5198     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5199       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5200         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5201         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5202        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5203       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5204         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5205         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5206         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5207     };
5208     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5209      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5210      * map node ctx => cabac ctx for level=1 */
5211     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5212     /* map node ctx => cabac ctx for level>1 */
5213     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5214     static const uint8_t coeff_abs_level_transition[2][8] = {
5215     /* update node ctx after decoding a level=1 */
5216         { 1, 2, 3, 3, 4, 5, 6, 7 },
5217     /* update node ctx after decoding a level>1 */
5218         { 4, 4, 4, 4, 5, 6, 7, 7 }
5219     };
5220
5221     int index[64];
5222
5223     int av_unused last;
5224     int coeff_count = 0;
5225     int node_ctx = 0;
5226
5227     uint8_t *significant_coeff_ctx_base;
5228     uint8_t *last_coeff_ctx_base;
5229     uint8_t *abs_level_m1_ctx_base;
5230
5231 #ifndef ARCH_X86
5232 #define CABAC_ON_STACK
5233 #endif
5234 #ifdef CABAC_ON_STACK
5235 #define CC &cc
5236     CABACContext cc;
5237     cc.range     = h->cabac.range;
5238     cc.low       = h->cabac.low;
5239     cc.bytestream= h->cabac.bytestream;
5240 #else
5241 #define CC &h->cabac
5242 #endif
5243
5244
5245     /* cat: 0-> DC 16x16  n = 0
5246      *      1-> AC 16x16  n = luma4x4idx
5247      *      2-> Luma4x4   n = luma4x4idx
5248      *      3-> DC Chroma n = iCbCr
5249      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5250      *      5-> Luma8x8   n = 4 * luma8x8idx
5251      */
5252
5253     /* read coded block flag */
5254     if( is_dc || cat != 5 ) {
5255         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5256             if( !is_dc ) {
5257                 if( cat == 4 )
5258                     h->non_zero_count_cache[scan8[16+n]] = 0;
5259                 else
5260                     h->non_zero_count_cache[scan8[n]] = 0;
5261             }
5262
5263 #ifdef CABAC_ON_STACK
5264             h->cabac.range     = cc.range     ;
5265             h->cabac.low       = cc.low       ;
5266             h->cabac.bytestream= cc.bytestream;
5267 #endif
5268             return;
5269         }
5270     }
5271
5272     significant_coeff_ctx_base = h->cabac_state
5273         + significant_coeff_flag_offset[MB_FIELD][cat];
5274     last_coeff_ctx_base = h->cabac_state
5275         + last_coeff_flag_offset[MB_FIELD][cat];
5276     abs_level_m1_ctx_base = h->cabac_state
5277         + coeff_abs_level_m1_offset[cat];
5278
5279     if( !is_dc && cat == 5 ) {
5280 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5281         for(last= 0; last < coefs; last++) { \
5282             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5283             if( get_cabac( CC, sig_ctx )) { \
5284                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5285                 index[coeff_count++] = last; \
5286                 if( get_cabac( CC, last_ctx ) ) { \
5287                     last= max_coeff; \
5288                     break; \
5289                 } \
5290             } \
5291         }\
5292         if( last == max_coeff -1 ) {\
5293             index[coeff_count++] = last;\
5294         }
5295         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5296 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5297         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5298     } else {
5299         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5300 #else
5301         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5302     } else {
5303         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5304 #endif
5305     }
5306     assert(coeff_count > 0);
5307
5308     if( is_dc ) {
5309         if( cat == 0 )
5310             h->cbp_table[h->mb_xy] |= 0x100;
5311         else
5312             h->cbp_table[h->mb_xy] |= 0x40 << n;
5313     } else {
5314         if( cat == 5 )
5315             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5316         else if( cat == 4 )
5317             h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5318         else {
5319             assert( cat == 1 || cat == 2 );
5320             h->non_zero_count_cache[scan8[n]] = coeff_count;
5321         }
5322     }
5323
5324     do {
5325         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5326
5327         int j= scantable[index[--coeff_count]];
5328
5329         if( get_cabac( CC, ctx ) == 0 ) {
5330             node_ctx = coeff_abs_level_transition[0][node_ctx];
5331             if( is_dc ) {
5332                 block[j] = get_cabac_bypass_sign( CC, -1);
5333             }else{
5334                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5335             }
5336         } else {
5337             int coeff_abs = 2;
5338             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5339             node_ctx = coeff_abs_level_transition[1][node_ctx];
5340
5341             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5342                 coeff_abs++;
5343             }
5344
5345             if( coeff_abs >= 15 ) {
5346                 int j = 0;
5347                 while( get_cabac_bypass( CC ) ) {
5348                     j++;
5349                 }
5350
5351                 coeff_abs=1;
5352                 while( j-- ) {
5353                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5354                 }
5355                 coeff_abs+= 14;
5356             }
5357
5358             if( is_dc ) {
5359                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5360             }else{
5361                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5362             }
5363         }
5364     } while( coeff_count );
5365 #ifdef CABAC_ON_STACK
5366             h->cabac.range     = cc.range     ;
5367             h->cabac.low       = cc.low       ;
5368             h->cabac.bytestream= cc.bytestream;
5369 #endif
5370
5371 }
5372
5373 #ifndef CONFIG_SMALL
5374 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5375     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5376 }
5377
5378 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5379     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5380 }
5381 #endif
5382
5383 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5384 #ifdef CONFIG_SMALL
5385     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5386 #else
5387     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5388     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5389 #endif
5390 }
5391
5392 static inline void compute_mb_neighbors(H264Context *h)
5393 {
5394     MpegEncContext * const s = &h->s;
5395     const int mb_xy  = h->mb_xy;
5396     h->top_mb_xy     = mb_xy - s->mb_stride;
5397     h->left_mb_xy[0] = mb_xy - 1;
5398     if(FRAME_MBAFF){
5399         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5400         const int top_pair_xy      = pair_xy     - s->mb_stride;
5401         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5402         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5403         const int curr_mb_frame_flag = !MB_FIELD;
5404         const int bottom = (s->mb_y & 1);
5405         if (bottom
5406                 ? !curr_mb_frame_flag // bottom macroblock
5407                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5408                 ) {
5409             h->top_mb_xy -= s->mb_stride;
5410         }
5411         if (left_mb_frame_flag != curr_mb_frame_flag) {
5412             h->left_mb_xy[0] = pair_xy - 1;
5413         }
5414     } else if (FIELD_PICTURE) {
5415         h->top_mb_xy -= s->mb_stride;
5416     }
5417     return;
5418 }
5419
5420 /**
5421  * decodes a macroblock
5422  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5423  */
5424 static int decode_mb_cabac(H264Context *h) {
5425     MpegEncContext * const s = &h->s;
5426     int mb_xy;
5427     int mb_type, partition_count, cbp = 0;
5428     int dct8x8_allowed= h->pps.transform_8x8_mode;
5429
5430     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5431
5432     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5433
5434     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5435     if( h->slice_type_nos != FF_I_TYPE ) {
5436         int skip;
5437         /* a skipped mb needs the aff flag from the following mb */
5438         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5439             predict_field_decoding_flag(h);
5440         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5441             skip = h->next_mb_skipped;
5442         else
5443             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5444         /* read skip flags */
5445         if( skip ) {
5446             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5447                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5448                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5449                 if(h->next_mb_skipped)
5450                     predict_field_decoding_flag(h);
5451                 else
5452                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5453             }
5454
5455             decode_mb_skip(h);
5456
5457             h->cbp_table[mb_xy] = 0;
5458             h->chroma_pred_mode_table[mb_xy] = 0;
5459             h->last_qscale_diff = 0;
5460
5461             return 0;
5462
5463         }
5464     }
5465     if(FRAME_MBAFF){
5466         if( (s->mb_y&1) == 0 )
5467             h->mb_mbaff =
5468             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5469     }else
5470         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5471
5472     h->prev_mb_skipped = 0;
5473
5474     compute_mb_neighbors(h);
5475     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5476         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5477         return -1;
5478     }
5479
5480     if( h->slice_type_nos == FF_B_TYPE ) {
5481         if( mb_type < 23 ){
5482             partition_count= b_mb_type_info[mb_type].partition_count;
5483             mb_type=         b_mb_type_info[mb_type].type;
5484         }else{
5485             mb_type -= 23;
5486             goto decode_intra_mb;
5487         }
5488     } else if( h->slice_type_nos == FF_P_TYPE ) {
5489         if( mb_type < 5) {
5490             partition_count= p_mb_type_info[mb_type].partition_count;
5491             mb_type=         p_mb_type_info[mb_type].type;
5492         } else {
5493             mb_type -= 5;
5494             goto decode_intra_mb;
5495         }
5496     } else {
5497         if(h->slice_type == FF_SI_TYPE && mb_type)
5498             mb_type--;
5499         assert(h->slice_type_nos == FF_I_TYPE);
5500 decode_intra_mb:
5501         partition_count = 0;
5502         cbp= i_mb_type_info[mb_type].cbp;
5503         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5504         mb_type= i_mb_type_info[mb_type].type;
5505     }
5506     if(MB_FIELD)
5507         mb_type |= MB_TYPE_INTERLACED;
5508
5509     h->slice_table[ mb_xy ]= h->slice_num;
5510
5511     if(IS_INTRA_PCM(mb_type)) {
5512         const uint8_t *ptr;
5513
5514         // We assume these blocks are very rare so we do not optimize it.
5515         // FIXME The two following lines get the bitstream position in the cabac
5516         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5517         ptr= h->cabac.bytestream;
5518         if(h->cabac.low&0x1) ptr--;
5519         if(CABAC_BITS==16){
5520             if(h->cabac.low&0x1FF) ptr--;
5521         }
5522
5523         // The pixels are stored in the same order as levels in h->mb array.
5524         memcpy(h->mb, ptr, 256); ptr+=256;
5525         if(CHROMA){
5526             memcpy(h->mb+128, ptr, 128); ptr+=128;
5527         }
5528
5529         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5530
5531         // All blocks are present
5532         h->cbp_table[mb_xy] = 0x1ef;
5533         h->chroma_pred_mode_table[mb_xy] = 0;
5534         // In deblocking, the quantizer is 0
5535         s->current_picture.qscale_table[mb_xy]= 0;
5536         // All coeffs are present
5537         memset(h->non_zero_count[mb_xy], 16, 16);
5538         s->current_picture.mb_type[mb_xy]= mb_type;
5539         h->last_qscale_diff = 0;
5540         return 0;
5541     }
5542
5543     if(MB_MBAFF){
5544         h->ref_count[0] <<= 1;
5545         h->ref_count[1] <<= 1;
5546     }
5547
5548     fill_caches(h, mb_type, 0);
5549
5550     if( IS_INTRA( mb_type ) ) {
5551         int i, pred_mode;
5552         if( IS_INTRA4x4( mb_type ) ) {
5553             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5554                 mb_type |= MB_TYPE_8x8DCT;
5555                 for( i = 0; i < 16; i+=4 ) {
5556                     int pred = pred_intra_mode( h, i );
5557                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5558                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5559                 }
5560             } else {
5561                 for( i = 0; i < 16; i++ ) {
5562                     int pred = pred_intra_mode( h, i );
5563                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5564
5565                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5566                 }
5567             }
5568             write_back_intra_pred_mode(h);
5569             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5570         } else {
5571             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5572             if( h->intra16x16_pred_mode < 0 ) return -1;
5573         }
5574         if(CHROMA){
5575             h->chroma_pred_mode_table[mb_xy] =
5576             pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5577
5578             pred_mode= check_intra_pred_mode( h, pred_mode );
5579             if( pred_mode < 0 ) return -1;
5580             h->chroma_pred_mode= pred_mode;
5581         }
5582     } else if( partition_count == 4 ) {
5583         int i, j, sub_partition_count[4], list, ref[2][4];
5584
5585         if( h->slice_type_nos == FF_B_TYPE ) {
5586             for( i = 0; i < 4; i++ ) {
5587                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5588                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5589                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5590             }
5591             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5592                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5593                 pred_direct_motion(h, &mb_type);
5594                 h->ref_cache[0][scan8[4]] =
5595                 h->ref_cache[1][scan8[4]] =
5596                 h->ref_cache[0][scan8[12]] =
5597                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5598                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5599                     for( i = 0; i < 4; i++ )
5600                         if( IS_DIRECT(h->sub_mb_type[i]) )
5601                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5602                 }
5603             }
5604         } else {
5605             for( i = 0; i < 4; i++ ) {
5606                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5607                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5608                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5609             }
5610         }
5611
5612         for( list = 0; list < h->list_count; list++ ) {
5613                 for( i = 0; i < 4; i++ ) {
5614                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5615                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5616                         if( h->ref_count[list] > 1 )
5617                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5618                         else
5619                             ref[list][i] = 0;
5620                     } else {
5621                         ref[list][i] = -1;
5622                     }
5623                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5624                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5625                 }
5626         }
5627
5628         if(dct8x8_allowed)
5629             dct8x8_allowed = get_dct8x8_allowed(h);
5630
5631         for(list=0; list<h->list_count; list++){
5632             for(i=0; i<4; i++){
5633                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5634                 if(IS_DIRECT(h->sub_mb_type[i])){
5635                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5636                     continue;
5637                 }
5638
5639                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5640                     const int sub_mb_type= h->sub_mb_type[i];
5641                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5642                     for(j=0; j<sub_partition_count[i]; j++){
5643                         int mpx, mpy;
5644                         int mx, my;
5645                         const int index= 4*i + block_width*j;
5646                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5647                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5648                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5649
5650                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5651                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5652                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5653
5654                         if(IS_SUB_8X8(sub_mb_type)){
5655                             mv_cache[ 1 ][0]=
5656                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5657                             mv_cache[ 1 ][1]=
5658                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5659
5660                             mvd_cache[ 1 ][0]=
5661                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5662                             mvd_cache[ 1 ][1]=
5663                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5664                         }else if(IS_SUB_8X4(sub_mb_type)){
5665                             mv_cache[ 1 ][0]= mx;
5666                             mv_cache[ 1 ][1]= my;
5667
5668                             mvd_cache[ 1 ][0]= mx - mpx;
5669                             mvd_cache[ 1 ][1]= my - mpy;
5670                         }else if(IS_SUB_4X8(sub_mb_type)){
5671                             mv_cache[ 8 ][0]= mx;
5672                             mv_cache[ 8 ][1]= my;
5673
5674                             mvd_cache[ 8 ][0]= mx - mpx;
5675                             mvd_cache[ 8 ][1]= my - mpy;
5676                         }
5677                         mv_cache[ 0 ][0]= mx;
5678                         mv_cache[ 0 ][1]= my;
5679
5680                         mvd_cache[ 0 ][0]= mx - mpx;
5681                         mvd_cache[ 0 ][1]= my - mpy;
5682                     }
5683                 }else{
5684                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5685                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5686                     p[0] = p[1] = p[8] = p[9] = 0;
5687                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5688                 }
5689             }
5690         }
5691     } else if( IS_DIRECT(mb_type) ) {
5692         pred_direct_motion(h, &mb_type);
5693         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5694         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5695         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5696     } else {
5697         int list, mx, my, i, mpx, mpy;
5698         if(IS_16X16(mb_type)){
5699             for(list=0; list<h->list_count; list++){
5700                 if(IS_DIR(mb_type, 0, list)){
5701                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
5702                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5703                 }else
5704                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5705             }
5706             for(list=0; list<h->list_count; list++){
5707                 if(IS_DIR(mb_type, 0, list)){
5708                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5709
5710                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5711                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5712                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5713
5714                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5715                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5716                 }else
5717                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5718             }
5719         }
5720         else if(IS_16X8(mb_type)){
5721             for(list=0; list<h->list_count; list++){
5722                     for(i=0; i<2; i++){
5723                         if(IS_DIR(mb_type, i, list)){
5724                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
5725                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5726                         }else
5727                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5728                     }
5729             }
5730             for(list=0; list<h->list_count; list++){
5731                 for(i=0; i<2; i++){
5732                     if(IS_DIR(mb_type, i, list)){
5733                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5734                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5735                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5736                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5737
5738                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5739                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5740                     }else{
5741                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5742                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5743                     }
5744                 }
5745             }
5746         }else{
5747             assert(IS_8X16(mb_type));
5748             for(list=0; list<h->list_count; list++){
5749                     for(i=0; i<2; i++){
5750                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5751                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
5752                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5753                         }else
5754                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5755                     }
5756             }
5757             for(list=0; list<h->list_count; list++){
5758                 for(i=0; i<2; i++){
5759                     if(IS_DIR(mb_type, i, list)){
5760                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5761                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5762                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5763
5764                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5765                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5766                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5767                     }else{
5768                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5769                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5770                     }
5771                 }
5772             }
5773         }
5774     }
5775
5776    if( IS_INTER( mb_type ) ) {
5777         h->chroma_pred_mode_table[mb_xy] = 0;
5778         write_back_motion( h, mb_type );
5779    }
5780
5781     if( !IS_INTRA16x16( mb_type ) ) {
5782         cbp  = decode_cabac_mb_cbp_luma( h );
5783         if(CHROMA)
5784             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5785     }
5786
5787     h->cbp_table[mb_xy] = h->cbp = cbp;
5788
5789     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5790         if( decode_cabac_mb_transform_size( h ) )
5791             mb_type |= MB_TYPE_8x8DCT;
5792     }
5793     s->current_picture.mb_type[mb_xy]= mb_type;
5794
5795     if( cbp || IS_INTRA16x16( mb_type ) ) {
5796         const uint8_t *scan, *scan8x8, *dc_scan;
5797         const uint32_t *qmul;
5798         int dqp;
5799
5800         if(IS_INTERLACED(mb_type)){
5801             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5802             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5803             dc_scan= luma_dc_field_scan;
5804         }else{
5805             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5806             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5807             dc_scan= luma_dc_zigzag_scan;
5808         }
5809
5810         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5811         if( dqp == INT_MIN ){
5812             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5813             return -1;
5814         }
5815         s->qscale += dqp;
5816         if(((unsigned)s->qscale) > 51){
5817             if(s->qscale<0) s->qscale+= 52;
5818             else            s->qscale-= 52;
5819         }
5820         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5821         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5822
5823         if( IS_INTRA16x16( mb_type ) ) {
5824             int i;
5825             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5826             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5827
5828             if( cbp&15 ) {
5829                 qmul = h->dequant4_coeff[0][s->qscale];
5830                 for( i = 0; i < 16; i++ ) {
5831                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5832                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5833                 }
5834             } else {
5835                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5836             }
5837         } else {
5838             int i8x8, i4x4;
5839             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5840                 if( cbp & (1<<i8x8) ) {
5841                     if( IS_8x8DCT(mb_type) ) {
5842                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5843                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5844                     } else {
5845                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5846                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5847                             const int index = 4*i8x8 + i4x4;
5848                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5849 //START_TIMER
5850                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5851 //STOP_TIMER("decode_residual")
5852                         }
5853                     }
5854                 } else {
5855                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5856                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5857                 }
5858             }
5859         }
5860
5861         if( cbp&0x30 ){
5862             int c;
5863             for( c = 0; c < 2; c++ ) {
5864                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5865                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5866             }
5867         }
5868
5869         if( cbp&0x20 ) {
5870             int c, i;
5871             for( c = 0; c < 2; c++ ) {
5872                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5873                 for( i = 0; i < 4; i++ ) {
5874                     const int index = 16 + 4 * c + i;
5875                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5876                     decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15);
5877                 }
5878             }
5879         } else {
5880             uint8_t * const nnz= &h->non_zero_count_cache[0];
5881             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5882             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5883         }
5884     } else {
5885         uint8_t * const nnz= &h->non_zero_count_cache[0];
5886         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5887         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5888         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5889         h->last_qscale_diff = 0;
5890     }
5891
5892     s->current_picture.qscale_table[mb_xy]= s->qscale;
5893     write_back_non_zero_count(h);
5894
5895     if(MB_MBAFF){
5896         h->ref_count[0] >>= 1;
5897         h->ref_count[1] >>= 1;
5898     }
5899
5900     return 0;
5901 }
5902
5903
5904 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5905     int i, d;
5906     const int index_a = qp + h->slice_alpha_c0_offset;
5907     const int alpha = (alpha_table+52)[index_a];
5908     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5909
5910     if( bS[0] < 4 ) {
5911         int8_t tc[4];
5912         for(i=0; i<4; i++)
5913             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
5914         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5915     } else {
5916         /* 16px edge length, because bS=4 is triggered by being at
5917          * the edge of an intra MB, so all 4 bS are the same */
5918             for( d = 0; d < 16; d++ ) {
5919                 const int p0 = pix[-1];
5920                 const int p1 = pix[-2];
5921                 const int p2 = pix[-3];
5922
5923                 const int q0 = pix[0];
5924                 const int q1 = pix[1];
5925                 const int q2 = pix[2];
5926
5927                 if( FFABS( p0 - q0 ) < alpha &&
5928                     FFABS( p1 - p0 ) < beta &&
5929                     FFABS( q1 - q0 ) < beta ) {
5930
5931                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5932                         if( FFABS( p2 - p0 ) < beta)
5933                         {
5934                             const int p3 = pix[-4];
5935                             /* p0', p1', p2' */
5936                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5937                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5938                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5939                         } else {
5940                             /* p0' */
5941                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5942                         }
5943                         if( FFABS( q2 - q0 ) < beta)
5944                         {
5945                             const int q3 = pix[3];
5946                             /* q0', q1', q2' */
5947                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5948                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5949                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5950                         } else {
5951                             /* q0' */
5952                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5953                         }
5954                     }else{
5955                         /* p0', q0' */
5956                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5957                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5958                     }
5959                     tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
5960                 }
5961                 pix += stride;
5962             }
5963     }
5964 }
5965 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5966     int i;
5967     const int index_a = qp + h->slice_alpha_c0_offset;
5968     const int alpha = (alpha_table+52)[index_a];
5969     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5970
5971     if( bS[0] < 4 ) {
5972         int8_t tc[4];
5973         for(i=0; i<4; i++)
5974             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
5975         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
5976     } else {
5977         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
5978     }
5979 }
5980
5981 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5982     int i;
5983     for( i = 0; i < 16; i++, pix += stride) {
5984         int index_a;
5985         int alpha;
5986         int beta;
5987
5988         int qp_index;
5989         int bS_index = (i >> 1);
5990         if (!MB_FIELD) {
5991             bS_index &= ~1;
5992             bS_index |= (i & 1);
5993         }
5994
5995         if( bS[bS_index] == 0 ) {
5996             continue;
5997         }
5998
5999         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6000         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6001         alpha = (alpha_table+52)[index_a];
6002         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6003
6004         if( bS[bS_index] < 4 ) {
6005             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
6006             const int p0 = pix[-1];
6007             const int p1 = pix[-2];
6008             const int p2 = pix[-3];
6009             const int q0 = pix[0];
6010             const int q1 = pix[1];
6011             const int q2 = pix[2];
6012
6013             if( FFABS( p0 - q0 ) < alpha &&
6014                 FFABS( p1 - p0 ) < beta &&
6015                 FFABS( q1 - q0 ) < beta ) {
6016                 int tc = tc0;
6017                 int i_delta;
6018
6019                 if( FFABS( p2 - p0 ) < beta ) {
6020                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6021                     tc++;
6022                 }
6023                 if( FFABS( q2 - q0 ) < beta ) {
6024                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6025                     tc++;
6026                 }
6027
6028                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6029                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6030                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6031                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6032             }
6033         }else{
6034             const int p0 = pix[-1];
6035             const int p1 = pix[-2];
6036             const int p2 = pix[-3];
6037
6038             const int q0 = pix[0];
6039             const int q1 = pix[1];
6040             const int q2 = pix[2];
6041
6042             if( FFABS( p0 - q0 ) < alpha &&
6043                 FFABS( p1 - p0 ) < beta &&
6044                 FFABS( q1 - q0 ) < beta ) {
6045
6046                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6047                     if( FFABS( p2 - p0 ) < beta)
6048                     {
6049                         const int p3 = pix[-4];
6050                         /* p0', p1', p2' */
6051                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6052                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6053                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6054                     } else {
6055                         /* p0' */
6056                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6057                     }
6058                     if( FFABS( q2 - q0 ) < beta)
6059                     {
6060                         const int q3 = pix[3];
6061                         /* q0', q1', q2' */
6062                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6063                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6064                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6065                     } else {
6066                         /* q0' */
6067                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6068                     }
6069                 }else{
6070                     /* p0', q0' */
6071                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6072                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6073                 }
6074                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6075             }
6076         }
6077     }
6078 }
6079 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6080     int i;
6081     for( i = 0; i < 8; i++, pix += stride) {
6082         int index_a;
6083         int alpha;
6084         int beta;
6085
6086         int qp_index;
6087         int bS_index = i;
6088
6089         if( bS[bS_index] == 0 ) {
6090             continue;
6091         }
6092
6093         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6094         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6095         alpha = (alpha_table+52)[index_a];
6096         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6097
6098         if( bS[bS_index] < 4 ) {
6099             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6100             const int p0 = pix[-1];
6101             const int p1 = pix[-2];
6102             const int q0 = pix[0];
6103             const int q1 = pix[1];
6104
6105             if( FFABS( p0 - q0 ) < alpha &&
6106                 FFABS( p1 - p0 ) < beta &&
6107                 FFABS( q1 - q0 ) < beta ) {
6108                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6109
6110                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6111                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6112                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6113             }
6114         }else{
6115             const int p0 = pix[-1];
6116             const int p1 = pix[-2];
6117             const int q0 = pix[0];
6118             const int q1 = pix[1];
6119
6120             if( FFABS( p0 - q0 ) < alpha &&
6121                 FFABS( p1 - p0 ) < beta &&
6122                 FFABS( q1 - q0 ) < beta ) {
6123
6124                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6125                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6126                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6127             }
6128         }
6129     }
6130 }
6131
6132 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6133     int i, d;
6134     const int index_a = qp + h->slice_alpha_c0_offset;
6135     const int alpha = (alpha_table+52)[index_a];
6136     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6137     const int pix_next  = stride;
6138
6139     if( bS[0] < 4 ) {
6140         int8_t tc[4];
6141         for(i=0; i<4; i++)
6142             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6143         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6144     } else {
6145         /* 16px edge length, see filter_mb_edgev */
6146             for( d = 0; d < 16; d++ ) {
6147                 const int p0 = pix[-1*pix_next];
6148                 const int p1 = pix[-2*pix_next];
6149                 const int p2 = pix[-3*pix_next];
6150                 const int q0 = pix[0];
6151                 const int q1 = pix[1*pix_next];
6152                 const int q2 = pix[2*pix_next];
6153
6154                 if( FFABS( p0 - q0 ) < alpha &&
6155                     FFABS( p1 - p0 ) < beta &&
6156                     FFABS( q1 - q0 ) < beta ) {
6157
6158                     const int p3 = pix[-4*pix_next];
6159                     const int q3 = pix[ 3*pix_next];
6160
6161                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6162                         if( FFABS( p2 - p0 ) < beta) {
6163                             /* p0', p1', p2' */
6164                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6165                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6166                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6167                         } else {
6168                             /* p0' */
6169                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6170                         }
6171                         if( FFABS( q2 - q0 ) < beta) {
6172                             /* q0', q1', q2' */
6173                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6174                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6175                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6176                         } else {
6177                             /* q0' */
6178                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6179                         }
6180                     }else{
6181                         /* p0', q0' */
6182                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6183                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6184                     }
6185                     tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6186                 }
6187                 pix++;
6188             }
6189     }
6190 }
6191
6192 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6193     int i;
6194     const int index_a = qp + h->slice_alpha_c0_offset;
6195     const int alpha = (alpha_table+52)[index_a];
6196     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6197
6198     if( bS[0] < 4 ) {
6199         int8_t tc[4];
6200         for(i=0; i<4; i++)
6201             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6202         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6203     } else {
6204         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6205     }
6206 }
6207
6208 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6209     MpegEncContext * const s = &h->s;
6210     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6211     int mb_xy, mb_type;
6212     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6213
6214     mb_xy = h->mb_xy;
6215
6216     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6217 1 ||
6218        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6219                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6220         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6221         return;
6222     }
6223     assert(!FRAME_MBAFF);
6224
6225     mb_type = s->current_picture.mb_type[mb_xy];
6226     qp = s->current_picture.qscale_table[mb_xy];
6227     qp0 = s->current_picture.qscale_table[mb_xy-1];
6228     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6229     qpc = get_chroma_qp( h, 0, qp );
6230     qpc0 = get_chroma_qp( h, 0, qp0 );
6231     qpc1 = get_chroma_qp( h, 0, qp1 );
6232     qp0 = (qp + qp0 + 1) >> 1;
6233     qp1 = (qp + qp1 + 1) >> 1;
6234     qpc0 = (qpc + qpc0 + 1) >> 1;
6235     qpc1 = (qpc + qpc1 + 1) >> 1;
6236     qp_thresh = 15 - h->slice_alpha_c0_offset;
6237     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6238        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6239         return;
6240
6241     if( IS_INTRA(mb_type) ) {
6242         int16_t bS4[4] = {4,4,4,4};
6243         int16_t bS3[4] = {3,3,3,3};
6244         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6245         if( IS_8x8DCT(mb_type) ) {
6246             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6247             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6248             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6249             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6250         } else {
6251             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6252             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6253             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6254             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6255             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6256             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6257             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6258             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6259         }
6260         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6261         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6262         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6263         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6264         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6265         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6266         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6267         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6268         return;
6269     } else {
6270         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6271         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6272         int edges;
6273         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6274             edges = 4;
6275             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6276         } else {
6277             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6278                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6279             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6280                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6281                              ? 3 : 0;
6282             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6283             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6284             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6285                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6286         }
6287         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6288             bSv[0][0] = 0x0004000400040004ULL;
6289         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6290             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6291
6292 #define FILTER(hv,dir,edge)\
6293         if(bSv[dir][edge]) {\
6294             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6295             if(!(edge&1)) {\
6296                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6297                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6298             }\
6299         }
6300         if( edges == 1 ) {
6301             FILTER(v,0,0);
6302             FILTER(h,1,0);
6303         } else if( IS_8x8DCT(mb_type) ) {
6304             FILTER(v,0,0);
6305             FILTER(v,0,2);
6306             FILTER(h,1,0);
6307             FILTER(h,1,2);
6308         } else {
6309             FILTER(v,0,0);
6310             FILTER(v,0,1);
6311             FILTER(v,0,2);
6312             FILTER(v,0,3);
6313             FILTER(h,1,0);
6314             FILTER(h,1,1);
6315             FILTER(h,1,2);
6316             FILTER(h,1,3);
6317         }
6318 #undef FILTER
6319     }
6320 }
6321
6322 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6323     MpegEncContext * const s = &h->s;
6324     const int mb_xy= mb_x + mb_y*s->mb_stride;
6325     const int mb_type = s->current_picture.mb_type[mb_xy];
6326     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6327     int first_vertical_edge_done = 0;
6328     int dir;
6329
6330     //for sufficiently low qp, filtering wouldn't do anything
6331     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6332     if(!FRAME_MBAFF){
6333         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6334         int qp = s->current_picture.qscale_table[mb_xy];
6335         if(qp <= qp_thresh
6336            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6337            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6338             return;
6339         }
6340     }
6341
6342     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6343     if(!h->pps.cabac && h->pps.transform_8x8_mode){
6344         int top_type, left_type[2];
6345         top_type     = s->current_picture.mb_type[h->top_mb_xy]    ;
6346         left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6347         left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6348
6349         if(IS_8x8DCT(top_type)){
6350             h->non_zero_count_cache[4+8*0]=
6351             h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6352             h->non_zero_count_cache[6+8*0]=
6353             h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6354         }
6355         if(IS_8x8DCT(left_type[0])){
6356             h->non_zero_count_cache[3+8*1]=
6357             h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6358         }
6359         if(IS_8x8DCT(left_type[1])){
6360             h->non_zero_count_cache[3+8*3]=
6361             h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6362         }
6363
6364         if(IS_8x8DCT(mb_type)){
6365             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
6366             h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp_table[mb_xy] & 1;
6367
6368             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6369             h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp_table[mb_xy] & 2;
6370
6371             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6372             h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp_table[mb_xy] & 4;
6373
6374             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6375             h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp_table[mb_xy] & 8;
6376         }
6377     }
6378
6379     if (FRAME_MBAFF
6380             // left mb is in picture
6381             && h->slice_table[mb_xy-1] != 255
6382             // and current and left pair do not have the same interlaced type
6383             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6384             // and left mb is in the same slice if deblocking_filter == 2
6385             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6386         /* First vertical edge is different in MBAFF frames
6387          * There are 8 different bS to compute and 2 different Qp
6388          */
6389         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6390         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6391         int16_t bS[8];
6392         int qp[2];
6393         int bqp[2];
6394         int rqp[2];
6395         int mb_qp, mbn0_qp, mbn1_qp;
6396         int i;
6397         first_vertical_edge_done = 1;
6398
6399         if( IS_INTRA(mb_type) )
6400             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6401         else {
6402             for( i = 0; i < 8; i++ ) {
6403                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6404
6405                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6406                     bS[i] = 4;
6407                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6408                          /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
6409                          h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
6410                     bS[i] = 2;
6411                 else
6412                     bS[i] = 1;
6413             }
6414         }
6415
6416         mb_qp = s->current_picture.qscale_table[mb_xy];
6417         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6418         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6419         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6420         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6421                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6422         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6423                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6424         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6425         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6426                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6427         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6428                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6429
6430         /* Filter edge */
6431         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6432         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6433         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6434         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6435         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6436     }
6437     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6438     for( dir = 0; dir < 2; dir++ )
6439     {
6440         int edge;
6441         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6442         const int mbm_type = s->current_picture.mb_type[mbm_xy];
6443         int (*ref2frm) [48+2] = h->ref2frm[ h->slice_num          &15 ];
6444         int (*ref2frmm)[48+2] = h->ref2frm[ h->slice_table[mbm_xy]&15 ];
6445         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
6446
6447         const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6448                                   == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6449         // how often to recheck mv-based bS when iterating between edges
6450         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6451                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6452         // how often to recheck mv-based bS when iterating along each edge
6453         const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6454
6455         if (first_vertical_edge_done) {
6456             start = 1;
6457             first_vertical_edge_done = 0;
6458         }
6459
6460         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6461             start = 1;
6462
6463         if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6464             && !IS_INTERLACED(mb_type)
6465             && IS_INTERLACED(mbm_type)
6466             ) {
6467             // This is a special case in the norm where the filtering must
6468             // be done twice (one each of the field) even if we are in a
6469             // frame macroblock.
6470             //
6471             static const int nnz_idx[4] = {4,5,6,3};
6472             unsigned int tmp_linesize   = 2 *   linesize;
6473             unsigned int tmp_uvlinesize = 2 * uvlinesize;
6474             int mbn_xy = mb_xy - 2 * s->mb_stride;
6475             int qp;
6476             int i, j;
6477             int16_t bS[4];
6478
6479             for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6480                 if( IS_INTRA(mb_type) ||
6481                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6482                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6483                 } else {
6484                     const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6485                     for( i = 0; i < 4; i++ ) {
6486                         if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6487                             mbn_nnz[nnz_idx[i]] != 0 )
6488                             bS[i] = 2;
6489                         else
6490                             bS[i] = 1;
6491                     }
6492                 }
6493                 // Do not use s->qscale as luma quantizer because it has not the same
6494                 // value in IPCM macroblocks.
6495                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6496                 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6497                 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6498                 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6499                 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6500                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6501                 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6502                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6503             }
6504
6505             start = 1;
6506         }
6507
6508         /* Calculate bS */
6509         for( edge = start; edge < edges; edge++ ) {
6510             /* mbn_xy: neighbor macroblock */
6511             const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6512             const int mbn_type = s->current_picture.mb_type[mbn_xy];
6513             int (*ref2frmn)[48+2] = edge > 0 ? ref2frm : ref2frmm;
6514             int16_t bS[4];
6515             int qp;
6516
6517             if( (edge&1) && IS_8x8DCT(mb_type) )
6518                 continue;
6519
6520             if( IS_INTRA(mb_type) ||
6521                 IS_INTRA(mbn_type) ) {
6522                 int value;
6523                 if (edge == 0) {
6524                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6525                         || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6526                     ) {
6527                         value = 4;
6528                     } else {
6529                         value = 3;
6530                     }
6531                 } else {
6532                     value = 3;
6533                 }
6534                 bS[0] = bS[1] = bS[2] = bS[3] = value;
6535             } else {
6536                 int i, l;
6537                 int mv_done;
6538
6539                 if( edge & mask_edge ) {
6540                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
6541                     mv_done = 1;
6542                 }
6543                 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6544                     bS[0] = bS[1] = bS[2] = bS[3] = 1;
6545                     mv_done = 1;
6546                 }
6547                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6548                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
6549                     int bn_idx= b_idx - (dir ? 8:1);
6550                     int v = 0;
6551
6552                     for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6553                         v |= ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[l][h->ref_cache[l][bn_idx]+2] ||
6554                              FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6555                              FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6556                     }
6557
6558                     if(h->slice_type_nos == FF_B_TYPE && v){
6559                         v=0;
6560                         for( l = 0; !v && l < 2; l++ ) {
6561                             int ln= 1-l;
6562                             v |= ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[ln][h->ref_cache[ln][bn_idx]+2] ||
6563                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6564                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6565                         }
6566                     }
6567
6568                     bS[0] = bS[1] = bS[2] = bS[3] = v;
6569                     mv_done = 1;
6570                 }
6571                 else
6572                     mv_done = 0;
6573
6574                 for( i = 0; i < 4; i++ ) {
6575                     int x = dir == 0 ? edge : i;
6576                     int y = dir == 0 ? i    : edge;
6577                     int b_idx= 8 + 4 + x + 8*y;
6578                     int bn_idx= b_idx - (dir ? 8:1);
6579
6580                     if( h->non_zero_count_cache[b_idx] != 0 ||
6581                         h->non_zero_count_cache[bn_idx] != 0 ) {
6582                         bS[i] = 2;
6583                     }
6584                     else if(!mv_done)
6585                     {
6586                         bS[i] = 0;
6587                         for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6588                             if( ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[l][h->ref_cache[l][bn_idx]+2] ||
6589                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6590                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6591                                 bS[i] = 1;
6592                                 break;
6593                             }
6594                         }
6595
6596                         if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6597                             bS[i] = 0;
6598                             for( l = 0; l < 2; l++ ) {
6599                                 int ln= 1-l;
6600                                 if( ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[ln][h->ref_cache[ln][bn_idx]+2] ||
6601                                     FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6602                                     FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6603                                     bS[i] = 1;
6604                                     break;
6605                                 }
6606                             }
6607                         }
6608                     }
6609                 }
6610
6611                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6612                     continue;
6613             }
6614
6615             /* Filter edge */
6616             // Do not use s->qscale as luma quantizer because it has not the same
6617             // value in IPCM macroblocks.
6618             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6619             //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6620             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6621             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6622             if( dir == 0 ) {
6623                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6624                 if( (edge&1) == 0 ) {
6625                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6626                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6627                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6628                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6629                 }
6630             } else {
6631                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6632                 if( (edge&1) == 0 ) {
6633                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6634                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6635                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6636                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6637                 }
6638             }
6639         }
6640     }
6641 }
6642
6643 static int decode_slice(struct AVCodecContext *avctx, H264Context *h){
6644     MpegEncContext * const s = &h->s;
6645     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6646
6647     s->mb_skip_run= -1;
6648
6649     if( h->pps.cabac ) {
6650         int i;
6651
6652         /* realign */
6653         align_get_bits( &s->gb );
6654
6655         /* init cabac */
6656         ff_init_cabac_states( &h->cabac);
6657         ff_init_cabac_decoder( &h->cabac,
6658                                s->gb.buffer + get_bits_count(&s->gb)/8,
6659                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6660         /* calculate pre-state */
6661         for( i= 0; i < 460; i++ ) {
6662             int pre;
6663             if( h->slice_type_nos == FF_I_TYPE )
6664                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6665             else
6666                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6667
6668             if( pre <= 63 )
6669                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6670             else
6671                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6672         }
6673
6674         for(;;){
6675 //START_TIMER
6676             int ret = decode_mb_cabac(h);
6677             int eos;
6678 //STOP_TIMER("decode_mb_cabac")
6679
6680             if(ret>=0) hl_decode_mb(h);
6681
6682             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6683                 s->mb_y++;
6684
6685                 if(ret>=0) ret = decode_mb_cabac(h);
6686
6687                 if(ret>=0) hl_decode_mb(h);
6688                 s->mb_y--;
6689             }
6690             eos = get_cabac_terminate( &h->cabac );
6691
6692             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6693                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6694                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6695                 return -1;
6696             }
6697
6698             if( ++s->mb_x >= s->mb_width ) {
6699                 s->mb_x = 0;
6700                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6701                 ++s->mb_y;
6702                 if(FIELD_OR_MBAFF_PICTURE) {
6703                     ++s->mb_y;
6704                 }
6705             }
6706
6707             if( eos || s->mb_y >= s->mb_height ) {
6708                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6709                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6710                 return 0;
6711             }
6712         }
6713
6714     } else {
6715         for(;;){
6716             int ret = decode_mb_cavlc(h);
6717
6718             if(ret>=0) hl_decode_mb(h);
6719
6720             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6721                 s->mb_y++;
6722                 ret = decode_mb_cavlc(h);
6723
6724                 if(ret>=0) hl_decode_mb(h);
6725                 s->mb_y--;
6726             }
6727
6728             if(ret<0){
6729                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6730                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6731
6732                 return -1;
6733             }
6734
6735             if(++s->mb_x >= s->mb_width){
6736                 s->mb_x=0;
6737                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6738                 ++s->mb_y;
6739                 if(FIELD_OR_MBAFF_PICTURE) {
6740                     ++s->mb_y;
6741                 }
6742                 if(s->mb_y >= s->mb_height){
6743                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6744
6745                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6746                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6747
6748                         return 0;
6749                     }else{
6750                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6751
6752                         return -1;
6753                     }
6754                 }
6755             }
6756
6757             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6758                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6759                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6760                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6761
6762                     return 0;
6763                 }else{
6764                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6765
6766                     return -1;
6767                 }
6768             }
6769         }
6770     }
6771
6772 #if 0
6773     for(;s->mb_y < s->mb_height; s->mb_y++){
6774         for(;s->mb_x < s->mb_width; s->mb_x++){
6775             int ret= decode_mb(h);
6776
6777             hl_decode_mb(h);
6778
6779             if(ret<0){
6780                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6781                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6782
6783                 return -1;
6784             }
6785
6786             if(++s->mb_x >= s->mb_width){
6787                 s->mb_x=0;
6788                 if(++s->mb_y >= s->mb_height){
6789                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6790                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6791
6792                         return 0;
6793                     }else{
6794                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6795
6796                         return -1;
6797                     }
6798                 }
6799             }
6800
6801             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6802                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6803                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6804
6805                     return 0;
6806                 }else{
6807                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6808
6809                     return -1;
6810                 }
6811             }
6812         }
6813         s->mb_x=0;
6814         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6815     }
6816 #endif
6817     return -1; //not reached
6818 }
6819
6820 static int decode_unregistered_user_data(H264Context *h, int size){
6821     MpegEncContext * const s = &h->s;
6822     uint8_t user_data[16+256];
6823     int e, build, i;
6824
6825     if(size<16)
6826         return -1;
6827
6828     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6829         user_data[i]= get_bits(&s->gb, 8);
6830     }
6831
6832     user_data[i]= 0;
6833     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6834     if(e==1 && build>=0)
6835         h->x264_build= build;
6836
6837     if(s->avctx->debug & FF_DEBUG_BUGS)
6838         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6839
6840     for(; i<size; i++)
6841         skip_bits(&s->gb, 8);
6842
6843     return 0;
6844 }
6845
6846 static int decode_sei(H264Context *h){
6847     MpegEncContext * const s = &h->s;
6848
6849     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6850         int size, type;
6851
6852         type=0;
6853         do{
6854             type+= show_bits(&s->gb, 8);
6855         }while(get_bits(&s->gb, 8) == 255);
6856
6857         size=0;
6858         do{
6859             size+= show_bits(&s->gb, 8);
6860         }while(get_bits(&s->gb, 8) == 255);
6861
6862         switch(type){
6863         case 5:
6864             if(decode_unregistered_user_data(h, size) < 0)
6865                 return -1;
6866             break;
6867         default:
6868             skip_bits(&s->gb, 8*size);
6869         }
6870
6871         //FIXME check bits here
6872         align_get_bits(&s->gb);
6873     }
6874
6875     return 0;
6876 }
6877
6878 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
6879     MpegEncContext * const s = &h->s;
6880     int cpb_count, i;
6881     cpb_count = get_ue_golomb(&s->gb) + 1;
6882     get_bits(&s->gb, 4); /* bit_rate_scale */
6883     get_bits(&s->gb, 4); /* cpb_size_scale */
6884     for(i=0; i<cpb_count; i++){
6885         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6886         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6887         get_bits1(&s->gb);     /* cbr_flag */
6888     }
6889     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6890     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
6891     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
6892     get_bits(&s->gb, 5); /* time_offset_length */
6893 }
6894
6895 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6896     MpegEncContext * const s = &h->s;
6897     int aspect_ratio_info_present_flag;
6898     unsigned int aspect_ratio_idc;
6899     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
6900
6901     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6902
6903     if( aspect_ratio_info_present_flag ) {
6904         aspect_ratio_idc= get_bits(&s->gb, 8);
6905         if( aspect_ratio_idc == EXTENDED_SAR ) {
6906             sps->sar.num= get_bits(&s->gb, 16);
6907             sps->sar.den= get_bits(&s->gb, 16);
6908         }else if(aspect_ratio_idc < sizeof(pixel_aspect)/sizeof(*pixel_aspect)){
6909             sps->sar=  pixel_aspect[aspect_ratio_idc];
6910         }else{
6911             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6912             return -1;
6913         }
6914     }else{
6915         sps->sar.num=
6916         sps->sar.den= 0;
6917     }
6918 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6919
6920     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6921         get_bits1(&s->gb);      /* overscan_appropriate_flag */
6922     }
6923
6924     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
6925         get_bits(&s->gb, 3);    /* video_format */
6926         get_bits1(&s->gb);      /* video_full_range_flag */
6927         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
6928             get_bits(&s->gb, 8); /* colour_primaries */
6929             get_bits(&s->gb, 8); /* transfer_characteristics */
6930             get_bits(&s->gb, 8); /* matrix_coefficients */
6931         }
6932     }
6933
6934     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
6935         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
6936         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
6937     }
6938
6939     sps->timing_info_present_flag = get_bits1(&s->gb);
6940     if(sps->timing_info_present_flag){
6941         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
6942         sps->time_scale = get_bits_long(&s->gb, 32);
6943         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
6944     }
6945
6946     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
6947     if(nal_hrd_parameters_present_flag)
6948         decode_hrd_parameters(h, sps);
6949     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
6950     if(vcl_hrd_parameters_present_flag)
6951         decode_hrd_parameters(h, sps);
6952     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
6953         get_bits1(&s->gb);     /* low_delay_hrd_flag */
6954     get_bits1(&s->gb);         /* pic_struct_present_flag */
6955
6956     sps->bitstream_restriction_flag = get_bits1(&s->gb);
6957     if(sps->bitstream_restriction_flag){
6958         unsigned int num_reorder_frames;
6959         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
6960         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
6961         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
6962         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
6963         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
6964         num_reorder_frames= get_ue_golomb(&s->gb);
6965         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
6966
6967         if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
6968             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", num_reorder_frames);
6969             return -1;
6970         }
6971
6972         sps->num_reorder_frames= num_reorder_frames;
6973     }
6974
6975     return 0;
6976 }
6977
6978 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
6979                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
6980     MpegEncContext * const s = &h->s;
6981     int i, last = 8, next = 8;
6982     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
6983     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
6984         memcpy(factors, fallback_list, size*sizeof(uint8_t));
6985     else
6986     for(i=0;i<size;i++){
6987         if(next)
6988             next = (last + get_se_golomb(&s->gb)) & 0xff;
6989         if(!i && !next){ /* matrix not written, we use the preset one */
6990             memcpy(factors, jvt_list, size*sizeof(uint8_t));
6991             break;
6992         }
6993         last = factors[scan[i]] = next ? next : last;
6994     }
6995 }
6996
6997 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
6998                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
6999     MpegEncContext * const s = &h->s;
7000     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7001     const uint8_t *fallback[4] = {
7002         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7003         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7004         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7005         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7006     };
7007     if(get_bits1(&s->gb)){
7008         sps->scaling_matrix_present |= is_sps;
7009         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7010         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7011         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7012         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7013         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7014         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7015         if(is_sps || pps->transform_8x8_mode){
7016             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7017             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7018         }
7019     } else if(fallback_sps) {
7020         memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
7021         memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
7022     }
7023 }
7024
7025 /**
7026  * Returns and optionally allocates SPS / PPS structures in the supplied array 'vec'
7027  */
7028 static void *
7029 alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
7030                     const size_t size, const char *name)
7031 {
7032     if(id>=max) {
7033         av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%d) out of range\n", name, id);
7034         return NULL;
7035     }
7036
7037     if(!vec[id]) {
7038         vec[id] = av_mallocz(size);
7039         if(vec[id] == NULL)
7040             av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
7041     }
7042     return vec[id];
7043 }
7044
7045 static inline int decode_seq_parameter_set(H264Context *h){
7046     MpegEncContext * const s = &h->s;
7047     int profile_idc, level_idc;
7048     unsigned int sps_id, tmp, mb_width, mb_height;
7049     int i;
7050     SPS *sps;
7051
7052     profile_idc= get_bits(&s->gb, 8);
7053     get_bits1(&s->gb);   //constraint_set0_flag
7054     get_bits1(&s->gb);   //constraint_set1_flag
7055     get_bits1(&s->gb);   //constraint_set2_flag
7056     get_bits1(&s->gb);   //constraint_set3_flag
7057     get_bits(&s->gb, 4); // reserved
7058     level_idc= get_bits(&s->gb, 8);
7059     sps_id= get_ue_golomb(&s->gb);
7060
7061     sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
7062     if(sps == NULL)
7063         return -1;
7064
7065     sps->profile_idc= profile_idc;
7066     sps->level_idc= level_idc;
7067
7068     if(sps->profile_idc >= 100){ //high profile
7069         sps->chroma_format_idc= get_ue_golomb(&s->gb);
7070         if(sps->chroma_format_idc == 3)
7071             get_bits1(&s->gb);  //residual_color_transform_flag
7072         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7073         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7074         sps->transform_bypass = get_bits1(&s->gb);
7075         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7076     }else{
7077         sps->scaling_matrix_present = 0;
7078         sps->chroma_format_idc= 1;
7079     }
7080
7081     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7082     sps->poc_type= get_ue_golomb(&s->gb);
7083
7084     if(sps->poc_type == 0){ //FIXME #define
7085         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7086     } else if(sps->poc_type == 1){//FIXME #define
7087         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7088         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7089         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7090         tmp= get_ue_golomb(&s->gb);
7091
7092         if(tmp >= sizeof(sps->offset_for_ref_frame) / sizeof(sps->offset_for_ref_frame[0])){
7093             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
7094             return -1;
7095         }
7096         sps->poc_cycle_length= tmp;
7097
7098         for(i=0; i<sps->poc_cycle_length; i++)
7099             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7100     }else if(sps->poc_type != 2){
7101         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7102         return -1;
7103     }
7104
7105     tmp= get_ue_golomb(&s->gb);
7106     if(tmp > MAX_PICTURE_COUNT-2 || tmp >= 32){
7107         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7108         return -1;
7109     }
7110     sps->ref_frame_count= tmp;
7111     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7112     mb_width= get_ue_golomb(&s->gb) + 1;
7113     mb_height= get_ue_golomb(&s->gb) + 1;
7114     if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
7115        avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
7116         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7117         return -1;
7118     }
7119     sps->mb_width = mb_width;
7120     sps->mb_height= mb_height;
7121
7122     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7123     if(!sps->frame_mbs_only_flag)
7124         sps->mb_aff= get_bits1(&s->gb);
7125     else
7126         sps->mb_aff= 0;
7127
7128     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7129
7130 #ifndef ALLOW_INTERLACE
7131     if(sps->mb_aff)
7132         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7133 #endif
7134     if(!sps->direct_8x8_inference_flag && sps->mb_aff)
7135         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
7136
7137     sps->crop= get_bits1(&s->gb);
7138     if(sps->crop){
7139         sps->crop_left  = get_ue_golomb(&s->gb);
7140         sps->crop_right = get_ue_golomb(&s->gb);
7141         sps->crop_top   = get_ue_golomb(&s->gb);
7142         sps->crop_bottom= get_ue_golomb(&s->gb);
7143         if(sps->crop_left || sps->crop_top){
7144             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7145         }
7146         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !h->sps.frame_mbs_only_flag)){
7147             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7148         }
7149     }else{
7150         sps->crop_left  =
7151         sps->crop_right =
7152         sps->crop_top   =
7153         sps->crop_bottom= 0;
7154     }
7155
7156     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7157     if( sps->vui_parameters_present_flag )
7158         decode_vui_parameters(h, sps);
7159
7160     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7161         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s\n",
7162                sps_id, sps->profile_idc, sps->level_idc,
7163                sps->poc_type,
7164                sps->ref_frame_count,
7165                sps->mb_width, sps->mb_height,
7166                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7167                sps->direct_8x8_inference_flag ? "8B8" : "",
7168                sps->crop_left, sps->crop_right,
7169                sps->crop_top, sps->crop_bottom,
7170                sps->vui_parameters_present_flag ? "VUI" : "",
7171                ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc]
7172                );
7173     }
7174     return 0;
7175 }
7176
7177 static void
7178 build_qp_table(PPS *pps, int t, int index)
7179 {
7180     int i;
7181     for(i = 0; i < 52; i++)
7182         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7183 }
7184
7185 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7186     MpegEncContext * const s = &h->s;
7187     unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
7188     PPS *pps;
7189
7190     pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
7191     if(pps == NULL)
7192         return -1;
7193
7194     tmp= get_ue_golomb(&s->gb);
7195     if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
7196         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7197         return -1;
7198     }
7199     pps->sps_id= tmp;
7200
7201     pps->cabac= get_bits1(&s->gb);
7202     pps->pic_order_present= get_bits1(&s->gb);
7203     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7204     if(pps->slice_group_count > 1 ){
7205         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7206         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7207         switch(pps->mb_slice_group_map_type){
7208         case 0:
7209 #if 0
7210 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7211 |    run_length[ i ]                                |1  |ue(v)   |
7212 #endif
7213             break;
7214         case 2:
7215 #if 0
7216 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7217 |{                                                  |   |        |
7218 |    top_left_mb[ i ]                               |1  |ue(v)   |
7219 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7220 |   }                                               |   |        |
7221 #endif
7222             break;
7223         case 3:
7224         case 4:
7225         case 5:
7226 #if 0
7227 |   slice_group_change_direction_flag               |1  |u(1)    |
7228 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7229 #endif
7230             break;
7231         case 6:
7232 #if 0
7233 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7234 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7235 |)                                                  |   |        |
7236 |    slice_group_id[ i ]                            |1  |u(v)    |
7237 #endif
7238             break;
7239         }
7240     }
7241     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7242     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7243     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7244         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7245         pps->ref_count[0]= pps->ref_count[1]= 1;
7246         return -1;
7247     }
7248
7249     pps->weighted_pred= get_bits1(&s->gb);
7250     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7251     pps->init_qp= get_se_golomb(&s->gb) + 26;
7252     pps->init_qs= get_se_golomb(&s->gb) + 26;
7253     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7254     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7255     pps->constrained_intra_pred= get_bits1(&s->gb);
7256     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7257
7258     pps->transform_8x8_mode= 0;
7259     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7260     memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
7261     memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
7262
7263     if(get_bits_count(&s->gb) < bit_length){
7264         pps->transform_8x8_mode= get_bits1(&s->gb);
7265         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7266         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7267     } else {
7268         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7269     }
7270
7271     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7272     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7273     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7274         h->pps.chroma_qp_diff= 1;
7275
7276     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7277         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7278                pps_id, pps->sps_id,
7279                pps->cabac ? "CABAC" : "CAVLC",
7280                pps->slice_group_count,
7281                pps->ref_count[0], pps->ref_count[1],
7282                pps->weighted_pred ? "weighted" : "",
7283                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7284                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7285                pps->constrained_intra_pred ? "CONSTR" : "",
7286                pps->redundant_pic_cnt_present ? "REDU" : "",
7287                pps->transform_8x8_mode ? "8x8DCT" : ""
7288                );
7289     }
7290
7291     return 0;
7292 }
7293
7294 /**
7295  * Call decode_slice() for each context.
7296  *
7297  * @param h h264 master context
7298  * @param context_count number of contexts to execute
7299  */
7300 static void execute_decode_slices(H264Context *h, int context_count){
7301     MpegEncContext * const s = &h->s;
7302     AVCodecContext * const avctx= s->avctx;
7303     H264Context *hx;
7304     int i;
7305
7306     if(context_count == 1) {
7307         decode_slice(avctx, h);
7308     } else {
7309         for(i = 1; i < context_count; i++) {
7310             hx = h->thread_context[i];
7311             hx->s.error_resilience = avctx->error_resilience;
7312             hx->s.error_count = 0;
7313         }
7314
7315         avctx->execute(avctx, (void *)decode_slice,
7316                        (void **)h->thread_context, NULL, context_count);
7317
7318         /* pull back stuff from slices to master context */
7319         hx = h->thread_context[context_count - 1];
7320         s->mb_x = hx->s.mb_x;
7321         s->mb_y = hx->s.mb_y;
7322         s->dropable = hx->s.dropable;
7323         s->picture_structure = hx->s.picture_structure;
7324         for(i = 1; i < context_count; i++)
7325             h->s.error_count += h->thread_context[i]->s.error_count;
7326     }
7327 }
7328
7329
7330 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7331     MpegEncContext * const s = &h->s;
7332     AVCodecContext * const avctx= s->avctx;
7333     int buf_index=0;
7334     H264Context *hx; ///< thread context
7335     int context_count = 0;
7336
7337     h->max_contexts = avctx->thread_count;
7338 #if 0
7339     int i;
7340     for(i=0; i<50; i++){
7341         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7342     }
7343 #endif
7344     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7345         h->current_slice = 0;
7346         if (!s->first_field)
7347             s->current_picture_ptr= NULL;
7348     }
7349
7350     for(;;){
7351         int consumed;
7352         int dst_length;
7353         int bit_length;
7354         const uint8_t *ptr;
7355         int i, nalsize = 0;
7356         int err;
7357
7358         if(h->is_avc) {
7359             if(buf_index >= buf_size) break;
7360             nalsize = 0;
7361             for(i = 0; i < h->nal_length_size; i++)
7362                 nalsize = (nalsize << 8) | buf[buf_index++];
7363             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7364                 if(nalsize == 1){
7365                     buf_index++;
7366                     continue;
7367                 }else{
7368                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7369                     break;
7370                 }
7371             }
7372         } else {
7373             // start code prefix search
7374             for(; buf_index + 3 < buf_size; buf_index++){
7375                 // This should always succeed in the first iteration.
7376                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7377                     break;
7378             }
7379
7380             if(buf_index+3 >= buf_size) break;
7381
7382             buf_index+=3;
7383         }
7384
7385         hx = h->thread_context[context_count];
7386
7387         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7388         if (ptr==NULL || dst_length < 0){
7389             return -1;
7390         }
7391         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7392             dst_length--;
7393         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7394
7395         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7396             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7397         }
7398
7399         if (h->is_avc && (nalsize != consumed)){
7400             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7401             consumed= nalsize;
7402         }
7403
7404         buf_index += consumed;
7405
7406         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7407            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7408             continue;
7409
7410       again:
7411         err = 0;
7412         switch(hx->nal_unit_type){
7413         case NAL_IDR_SLICE:
7414             if (h->nal_unit_type != NAL_IDR_SLICE) {
7415                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7416                 return -1;
7417             }
7418             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7419         case NAL_SLICE:
7420             init_get_bits(&hx->s.gb, ptr, bit_length);
7421             hx->intra_gb_ptr=
7422             hx->inter_gb_ptr= &hx->s.gb;
7423             hx->s.data_partitioning = 0;
7424
7425             if((err = decode_slice_header(hx, h)))
7426                break;
7427
7428             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7429             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7430                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7431                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7432                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7433                && avctx->skip_frame < AVDISCARD_ALL)
7434                 context_count++;
7435             break;
7436         case NAL_DPA:
7437             init_get_bits(&hx->s.gb, ptr, bit_length);
7438             hx->intra_gb_ptr=
7439             hx->inter_gb_ptr= NULL;
7440             hx->s.data_partitioning = 1;
7441
7442             err = decode_slice_header(hx, h);
7443             break;
7444         case NAL_DPB:
7445             init_get_bits(&hx->intra_gb, ptr, bit_length);
7446             hx->intra_gb_ptr= &hx->intra_gb;
7447             break;
7448         case NAL_DPC:
7449             init_get_bits(&hx->inter_gb, ptr, bit_length);
7450             hx->inter_gb_ptr= &hx->inter_gb;
7451
7452             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7453                && s->context_initialized
7454                && s->hurry_up < 5
7455                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7456                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7457                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7458                && avctx->skip_frame < AVDISCARD_ALL)
7459                 context_count++;
7460             break;
7461         case NAL_SEI:
7462             init_get_bits(&s->gb, ptr, bit_length);
7463             decode_sei(h);
7464             break;
7465         case NAL_SPS:
7466             init_get_bits(&s->gb, ptr, bit_length);
7467             decode_seq_parameter_set(h);
7468
7469             if(s->flags& CODEC_FLAG_LOW_DELAY)
7470                 s->low_delay=1;
7471
7472             if(avctx->has_b_frames < 2)
7473                 avctx->has_b_frames= !s->low_delay;
7474             break;
7475         case NAL_PPS:
7476             init_get_bits(&s->gb, ptr, bit_length);
7477
7478             decode_picture_parameter_set(h, bit_length);
7479
7480             break;
7481         case NAL_AUD:
7482         case NAL_END_SEQUENCE:
7483         case NAL_END_STREAM:
7484         case NAL_FILLER_DATA:
7485         case NAL_SPS_EXT:
7486         case NAL_AUXILIARY_SLICE:
7487             break;
7488         default:
7489             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7490         }
7491
7492         if(context_count == h->max_contexts) {
7493             execute_decode_slices(h, context_count);
7494             context_count = 0;
7495         }
7496
7497         if (err < 0)
7498             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7499         else if(err == 1) {
7500             /* Slice could not be decoded in parallel mode, copy down
7501              * NAL unit stuff to context 0 and restart. Note that
7502              * rbsp_buffer is not transferred, but since we no longer
7503              * run in parallel mode this should not be an issue. */
7504             h->nal_unit_type = hx->nal_unit_type;
7505             h->nal_ref_idc   = hx->nal_ref_idc;
7506             hx = h;
7507             goto again;
7508         }
7509     }
7510     if(context_count)
7511         execute_decode_slices(h, context_count);
7512     return buf_index;
7513 }
7514
7515 /**
7516  * returns the number of bytes consumed for building the current frame
7517  */
7518 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7519         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7520         if(pos+10>buf_size) pos=buf_size; // oops ;)
7521
7522         return pos;
7523 }
7524
7525 static int decode_frame(AVCodecContext *avctx,
7526                              void *data, int *data_size,
7527                              const uint8_t *buf, int buf_size)
7528 {
7529     H264Context *h = avctx->priv_data;
7530     MpegEncContext *s = &h->s;
7531     AVFrame *pict = data;
7532     int buf_index;
7533
7534     s->flags= avctx->flags;
7535     s->flags2= avctx->flags2;
7536
7537    /* end of stream, output what is still in the buffers */
7538     if (buf_size == 0) {
7539         Picture *out;
7540         int i, out_idx;
7541
7542 //FIXME factorize this with the output code below
7543         out = h->delayed_pic[0];
7544         out_idx = 0;
7545         for(i=1; h->delayed_pic[i] && h->delayed_pic[i]->poc; i++)
7546             if(h->delayed_pic[i]->poc < out->poc){
7547                 out = h->delayed_pic[i];
7548                 out_idx = i;
7549             }
7550
7551         for(i=out_idx; h->delayed_pic[i]; i++)
7552             h->delayed_pic[i] = h->delayed_pic[i+1];
7553
7554         if(out){
7555             *data_size = sizeof(AVFrame);
7556             *pict= *(AVFrame*)out;
7557         }
7558
7559         return 0;
7560     }
7561
7562     if(h->is_avc && !h->got_avcC) {
7563         int i, cnt, nalsize;
7564         unsigned char *p = avctx->extradata;
7565         if(avctx->extradata_size < 7) {
7566             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7567             return -1;
7568         }
7569         if(*p != 1) {
7570             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7571             return -1;
7572         }
7573         /* sps and pps in the avcC always have length coded with 2 bytes,
7574            so put a fake nal_length_size = 2 while parsing them */
7575         h->nal_length_size = 2;
7576         // Decode sps from avcC
7577         cnt = *(p+5) & 0x1f; // Number of sps
7578         p += 6;
7579         for (i = 0; i < cnt; i++) {
7580             nalsize = AV_RB16(p) + 2;
7581             if(decode_nal_units(h, p, nalsize) < 0) {
7582                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7583                 return -1;
7584             }
7585             p += nalsize;
7586         }
7587         // Decode pps from avcC
7588         cnt = *(p++); // Number of pps
7589         for (i = 0; i < cnt; i++) {
7590             nalsize = AV_RB16(p) + 2;
7591             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7592                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7593                 return -1;
7594             }
7595             p += nalsize;
7596         }
7597         // Now store right nal length size, that will be use to parse all other nals
7598         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7599         // Do not reparse avcC
7600         h->got_avcC = 1;
7601     }
7602
7603     if(avctx->frame_number==0 && !h->is_avc && s->avctx->extradata_size){
7604         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7605             return -1;
7606     }
7607
7608     buf_index=decode_nal_units(h, buf, buf_size);
7609     if(buf_index < 0)
7610         return -1;
7611
7612     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7613         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7614         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7615         return -1;
7616     }
7617
7618     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7619         Picture *out = s->current_picture_ptr;
7620         Picture *cur = s->current_picture_ptr;
7621         int i, pics, cross_idr, out_of_order, out_idx;
7622
7623         s->mb_y= 0;
7624
7625         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7626         s->current_picture_ptr->pict_type= s->pict_type;
7627
7628         if(!s->dropable) {
7629             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7630             h->prev_poc_msb= h->poc_msb;
7631             h->prev_poc_lsb= h->poc_lsb;
7632         }
7633         h->prev_frame_num_offset= h->frame_num_offset;
7634         h->prev_frame_num= h->frame_num;
7635
7636         /*
7637          * FIXME: Error handling code does not seem to support interlaced
7638          * when slices span multiple rows
7639          * The ff_er_add_slice calls don't work right for bottom
7640          * fields; they cause massive erroneous error concealing
7641          * Error marking covers both fields (top and bottom).
7642          * This causes a mismatched s->error_count
7643          * and a bad error table. Further, the error count goes to
7644          * INT_MAX when called for bottom field, because mb_y is
7645          * past end by one (callers fault) and resync_mb_y != 0
7646          * causes problems for the first MB line, too.
7647          */
7648         if (!FIELD_PICTURE)
7649             ff_er_frame_end(s);
7650
7651         MPV_frame_end(s);
7652
7653         if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7654             /* Wait for second field. */
7655             *data_size = 0;
7656
7657         } else {
7658             cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7659             /* Derive top_field_first from field pocs. */
7660             cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7661
7662         //FIXME do something with unavailable reference frames
7663
7664             /* Sort B-frames into display order */
7665
7666             if(h->sps.bitstream_restriction_flag
7667                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7668                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7669                 s->low_delay = 0;
7670             }
7671
7672             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7673                && !h->sps.bitstream_restriction_flag){
7674                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7675                 s->low_delay= 0;
7676             }
7677
7678             pics = 0;
7679             while(h->delayed_pic[pics]) pics++;
7680
7681             assert(pics <= MAX_DELAYED_PIC_COUNT);
7682
7683             h->delayed_pic[pics++] = cur;
7684             if(cur->reference == 0)
7685                 cur->reference = DELAYED_PIC_REF;
7686
7687             out = h->delayed_pic[0];
7688             out_idx = 0;
7689             for(i=1; h->delayed_pic[i] && h->delayed_pic[i]->poc; i++)
7690                 if(h->delayed_pic[i]->poc < out->poc){
7691                     out = h->delayed_pic[i];
7692                     out_idx = i;
7693                 }
7694             cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i];
7695
7696             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7697
7698             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7699                 { }
7700             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7701                || (s->low_delay &&
7702                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7703                  || cur->pict_type == FF_B_TYPE)))
7704             {
7705                 s->low_delay = 0;
7706                 s->avctx->has_b_frames++;
7707             }
7708
7709             if(out_of_order || pics > s->avctx->has_b_frames){
7710                 out->reference &= ~DELAYED_PIC_REF;
7711                 for(i=out_idx; h->delayed_pic[i]; i++)
7712                     h->delayed_pic[i] = h->delayed_pic[i+1];
7713             }
7714             if(!out_of_order && pics > s->avctx->has_b_frames){
7715                 *data_size = sizeof(AVFrame);
7716
7717                 h->outputed_poc = out->poc;
7718                 *pict= *(AVFrame*)out;
7719             }else{
7720                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7721             }
7722         }
7723     }
7724
7725     assert(pict->data[0] || !*data_size);
7726     ff_print_debug_info(s, pict);
7727 //printf("out %d\n", (int)pict->data[0]);
7728 #if 0 //?
7729
7730     /* Return the Picture timestamp as the frame number */
7731     /* we subtract 1 because it is added on utils.c     */
7732     avctx->frame_number = s->picture_number - 1;
7733 #endif
7734     return get_consumed_bytes(s, buf_index, buf_size);
7735 }
7736 #if 0
7737 static inline void fill_mb_avail(H264Context *h){
7738     MpegEncContext * const s = &h->s;
7739     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
7740
7741     if(s->mb_y){
7742         h->mb_avail[0]= s->mb_x                 && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
7743         h->mb_avail[1]=                            h->slice_table[mb_xy - s->mb_stride    ] == h->slice_num;
7744         h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
7745     }else{
7746         h->mb_avail[0]=
7747         h->mb_avail[1]=
7748         h->mb_avail[2]= 0;
7749     }
7750     h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
7751     h->mb_avail[4]= 1; //FIXME move out
7752     h->mb_avail[5]= 0; //FIXME move out
7753 }
7754 #endif
7755
7756 #ifdef TEST
7757 #undef printf
7758 #undef random
7759 #define COUNT 8000
7760 #define SIZE (COUNT*40)
7761 int main(void){
7762     int i;
7763     uint8_t temp[SIZE];
7764     PutBitContext pb;
7765     GetBitContext gb;
7766 //    int int_temp[10000];
7767     DSPContext dsp;
7768     AVCodecContext avctx;
7769
7770     dsputil_init(&dsp, &avctx);
7771
7772     init_put_bits(&pb, temp, SIZE);
7773     printf("testing unsigned exp golomb\n");
7774     for(i=0; i<COUNT; i++){
7775         START_TIMER
7776         set_ue_golomb(&pb, i);
7777         STOP_TIMER("set_ue_golomb");
7778     }
7779     flush_put_bits(&pb);
7780
7781     init_get_bits(&gb, temp, 8*SIZE);
7782     for(i=0; i<COUNT; i++){
7783         int j, s;
7784
7785         s= show_bits(&gb, 24);
7786
7787         START_TIMER
7788         j= get_ue_golomb(&gb);
7789         if(j != i){
7790             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7791 //            return -1;
7792         }
7793         STOP_TIMER("get_ue_golomb");
7794     }
7795
7796
7797     init_put_bits(&pb, temp, SIZE);
7798     printf("testing signed exp golomb\n");
7799     for(i=0; i<COUNT; i++){
7800         START_TIMER
7801         set_se_golomb(&pb, i - COUNT/2);
7802         STOP_TIMER("set_se_golomb");
7803     }
7804     flush_put_bits(&pb);
7805
7806     init_get_bits(&gb, temp, 8*SIZE);
7807     for(i=0; i<COUNT; i++){
7808         int j, s;
7809
7810         s= show_bits(&gb, 24);
7811
7812         START_TIMER
7813         j= get_se_golomb(&gb);
7814         if(j != i - COUNT/2){
7815             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7816 //            return -1;
7817         }
7818         STOP_TIMER("get_se_golomb");
7819     }
7820
7821 #if 0
7822     printf("testing 4x4 (I)DCT\n");
7823
7824     DCTELEM block[16];
7825     uint8_t src[16], ref[16];
7826     uint64_t error= 0, max_error=0;
7827
7828     for(i=0; i<COUNT; i++){
7829         int j;
7830 //        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
7831         for(j=0; j<16; j++){
7832             ref[j]= random()%255;
7833             src[j]= random()%255;
7834         }
7835
7836         h264_diff_dct_c(block, src, ref, 4);
7837
7838         //normalize
7839         for(j=0; j<16; j++){
7840 //            printf("%d ", block[j]);
7841             block[j]= block[j]*4;
7842             if(j&1) block[j]= (block[j]*4 + 2)/5;
7843             if(j&4) block[j]= (block[j]*4 + 2)/5;
7844         }
7845 //        printf("\n");
7846
7847         s->dsp.h264_idct_add(ref, block, 4);
7848 /*        for(j=0; j<16; j++){
7849             printf("%d ", ref[j]);
7850         }
7851         printf("\n");*/
7852
7853         for(j=0; j<16; j++){
7854             int diff= FFABS(src[j] - ref[j]);
7855
7856             error+= diff*diff;
7857             max_error= FFMAX(max_error, diff);
7858         }
7859     }
7860     printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
7861     printf("testing quantizer\n");
7862     for(qp=0; qp<52; qp++){
7863         for(i=0; i<16; i++)
7864             src1_block[i]= src2_block[i]= random()%255;
7865
7866     }
7867     printf("Testing NAL layer\n");
7868
7869     uint8_t bitstream[COUNT];
7870     uint8_t nal[COUNT*2];
7871     H264Context h;
7872     memset(&h, 0, sizeof(H264Context));
7873
7874     for(i=0; i<COUNT; i++){
7875         int zeros= i;
7876         int nal_length;
7877         int consumed;
7878         int out_length;
7879         uint8_t *out;
7880         int j;
7881
7882         for(j=0; j<COUNT; j++){
7883             bitstream[j]= (random() % 255) + 1;
7884         }
7885
7886         for(j=0; j<zeros; j++){
7887             int pos= random() % COUNT;
7888             while(bitstream[pos] == 0){
7889                 pos++;
7890                 pos %= COUNT;
7891             }
7892             bitstream[pos]=0;
7893         }
7894
7895         START_TIMER
7896
7897         nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
7898         if(nal_length<0){
7899             printf("encoding failed\n");
7900             return -1;
7901         }
7902
7903         out= decode_nal(&h, nal, &out_length, &consumed, nal_length);
7904
7905         STOP_TIMER("NAL")
7906
7907         if(out_length != COUNT){
7908             printf("incorrect length %d %d\n", out_length, COUNT);
7909             return -1;
7910         }
7911
7912         if(consumed != nal_length){
7913             printf("incorrect consumed length %d %d\n", nal_length, consumed);
7914             return -1;
7915         }
7916
7917         if(memcmp(bitstream, out, COUNT)){
7918             printf("mismatch\n");
7919             return -1;
7920         }
7921     }
7922 #endif
7923
7924     printf("Testing RBSP\n");
7925
7926
7927     return 0;
7928 }
7929 #endif /* TEST */
7930
7931
7932 static av_cold int decode_end(AVCodecContext *avctx)
7933 {
7934     H264Context *h = avctx->priv_data;
7935     MpegEncContext *s = &h->s;
7936
7937     av_freep(&h->rbsp_buffer[0]);
7938     av_freep(&h->rbsp_buffer[1]);
7939     free_tables(h); //FIXME cleanup init stuff perhaps
7940     MPV_common_end(s);
7941
7942 //    memset(h, 0, sizeof(H264Context));
7943
7944     return 0;
7945 }
7946
7947
7948 AVCodec h264_decoder = {
7949     "h264",
7950     CODEC_TYPE_VIDEO,
7951     CODEC_ID_H264,
7952     sizeof(H264Context),
7953     decode_init,
7954     NULL,
7955     decode_end,
7956     decode_frame,
7957     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
7958     .flush= flush_dpb,
7959     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
7960 };
7961
7962 #include "svq3.c"