2 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
3 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * H.264 / AVC / MPEG4 part10 codec.
25 * @author Michael Niedermayer <michaelni@gmx.at>
30 #include "mpegvideo.h"
33 #include "h264_parser.h"
35 #include "rectangle.h"
39 #include "i386/h264_i386.h"
46 * Value of Picture.reference when Picture is not a reference picture, but
47 * is held for delayed output.
49 #define DELAYED_PIC_REF 4
/* CAVLC code tables (coeff_token, total_zeros, run_before and their chroma-DC
 * variants). NOTE(review): presumably built once during decoder init and
 * read-only afterwards -- confirm against the VLC init code (not in view). */
51 static VLC coeff_token_vlc[4];
52 static VLC chroma_dc_coeff_token_vlc;
54 static VLC total_zeros_vlc[15];
55 static VLC chroma_dc_total_zeros_vlc[3];
57 static VLC run_vlc[6];
/* Forward declarations: SVQ3 shares this decoder's infrastructure, and the
 * loop-filter entry points are defined later in the file. */
60 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
61 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
62 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
63 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
65 static av_always_inline uint32_t pack16to32(int a, int b){
66 #ifdef WORDS_BIGENDIAN
67 return (b&0xFFFF) + (a<<16);
69 return (a&0xFFFF) + (b<<16);
/**
 * ff_rem6[qp] == qp % 6 for qp in [0,51].
 * Lookup table so the dequantisation code can split a QP into its scaling
 * table index (qp%6) without a runtime division.
 * Fix: the initializer had lost its closing "};".
 */
const uint8_t ff_rem6[52]={
0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
};
/**
 * ff_div6[qp] == qp / 6 for qp in [0,51].
 * Companion of ff_rem6: gives the right-shift amount used during
 * dequantisation without a runtime division.
 * Fix: the initializer had lost its closing "};".
 */
const uint8_t ff_div6[52]={
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
};
/**
 * fill_caches(): loads the per-macroblock prediction caches (intra sample
 * availability, intra4x4 modes, non-zero-count, motion vectors, reference
 * indices, mvd, B-direct flags) from the stored state of the neighbouring
 * macroblocks, handling MBAFF frame/field neighbour remapping.
 * NOTE(review): this excerpt is garbled -- stray original line numbers are
 * embedded at the start of each line and several lines (closing braces,
 * loop headers, else branches) are missing. Comments below annotate only
 * the visible logic; do not treat the block as compilable as-is.
 */
82 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
83 MpegEncContext * const s = &h->s;
84 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
85 int topleft_xy, top_xy, topright_xy, left_xy[2];
86 int topleft_type, top_type, topright_type, left_type[2];
88 int topleft_partition= -1;
91 top_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
93 //FIXME deblocking could skip the intra and nnz parts.
94 if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
97 /* Wow, what a mess, why didn't they simplify the interlacing & intra
98 * stuff, I can't imagine that these complex rules are worth it. */
/* Default (non-MBAFF) neighbour indices: left, topleft, top, topright. */
100 topleft_xy = top_xy - 1;
101 topright_xy= top_xy + 1;
102 left_xy[1] = left_xy[0] = mb_xy-1;
/* MBAFF neighbour remapping: when the current MB pair and a neighbour pair
 * differ in frame/field coding, the neighbour index must be shifted by one
 * MB row so prediction reads the spatially-correct half of the pair. */
112 const int pair_xy = s->mb_x + (s->mb_y & ~1)*s->mb_stride;
113 const int top_pair_xy = pair_xy - s->mb_stride;
114 const int topleft_pair_xy = top_pair_xy - 1;
115 const int topright_pair_xy = top_pair_xy + 1;
116 const int topleft_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
117 const int top_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
118 const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
119 const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
120 const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
121 const int bottom = (s->mb_y & 1);
122 tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
124 ? !curr_mb_frame_flag // bottom macroblock
125 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
127 top_xy -= s->mb_stride;
130 ? !curr_mb_frame_flag // bottom macroblock
131 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
133 topleft_xy -= s->mb_stride;
134 } else if(bottom && curr_mb_frame_flag && !left_mb_frame_flag) {
135 topleft_xy += s->mb_stride;
136 // take topleft mv from the middle of the mb, as opposed to all other modes which use the bottom-right partition
137 topleft_partition = 0;
140 ? !curr_mb_frame_flag // bottom macroblock
141 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
143 topright_xy -= s->mb_stride;
145 if (left_mb_frame_flag != curr_mb_frame_flag) {
146 left_xy[1] = left_xy[0] = pair_xy - 1;
147 if (curr_mb_frame_flag) {
168 left_xy[1] += s->mb_stride;
/* Publish the resolved neighbour indices for later use (deblocking etc.). */
181 h->top_mb_xy = top_xy;
182 h->left_mb_xy[0] = left_xy[0];
183 h->left_mb_xy[1] = left_xy[1];
/* Deblocking path: a slice_table entry below 255 marks a decoded MB, so
 * neighbours are usable even across slice boundaries here. */
187 top_type = h->slice_table[top_xy ] < 255 ? s->current_picture.mb_type[top_xy] : 0;
188 left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
189 left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
191 if(FRAME_MBAFF && !IS_INTRA(mb_type)){
/* Reload this MB's own luma nnz bits and motion data into the caches. */
193 int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
195 h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
196 for(list=0; list<h->list_count; list++){
197 if(USES_LIST(mb_type,list)){
198 uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
199 uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
200 int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
201 for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
207 *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
208 *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
210 *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
211 *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
213 fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
214 fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
/* Normal decode path: neighbours count only when in the same slice. */
219 topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
220 top_type = h->slice_table[top_xy ] == h->slice_num ? s->current_picture.mb_type[top_xy] : 0;
221 topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
222 left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
223 left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
/* Intra prediction sample-availability bitmasks; start from "everything
 * available" and clear bits for missing/constrained neighbours. */
226 if(IS_INTRA(mb_type)){
227 h->topleft_samples_available=
228 h->top_samples_available=
229 h->left_samples_available= 0xFFFF;
230 h->topright_samples_available= 0xEEEA;
232 if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
233 h->topleft_samples_available= 0xB3FF;
234 h->top_samples_available= 0x33FF;
235 h->topright_samples_available= 0x26EA;
238 if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
239 h->topleft_samples_available&= 0xDF5F;
240 h->left_samples_available&= 0x5F5F;
244 if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
245 h->topleft_samples_available&= 0x7FFF;
247 if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
248 h->topright_samples_available&= 0xFBFF;
/* Intra4x4 mode cache: copy the bottom row of the top neighbour and the
 * right column of the left neighbour(s); -1 / DC_PRED fallbacks elsewhere. */
250 if(IS_INTRA4x4(mb_type)){
251 if(IS_INTRA4x4(top_type)){
252 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
253 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
254 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
255 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
258 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
263 h->intra4x4_pred_mode_cache[4+8*0]=
264 h->intra4x4_pred_mode_cache[5+8*0]=
265 h->intra4x4_pred_mode_cache[6+8*0]=
266 h->intra4x4_pred_mode_cache[7+8*0]= pred;
269 if(IS_INTRA4x4(left_type[i])){
270 h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
271 h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
274 if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
279 h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
280 h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
295 //FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
/* Non-zero coefficient count cache from the top neighbour (luma row 3 and
 * the chroma DC/AC positions); 0 or 64 markers when unavailable. */
297 h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
298 h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
299 h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
300 h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
302 h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
303 h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
305 h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
306 h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
309 h->non_zero_count_cache[4+8*0]=
310 h->non_zero_count_cache[5+8*0]=
311 h->non_zero_count_cache[6+8*0]=
312 h->non_zero_count_cache[7+8*0]=
314 h->non_zero_count_cache[1+8*0]=
315 h->non_zero_count_cache[2+8*0]=
317 h->non_zero_count_cache[1+8*3]=
318 h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
322 for (i=0; i<2; i++) {
324 h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
325 h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
326 h->non_zero_count_cache[0+8*1 + 8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
327 h->non_zero_count_cache[0+8*4 + 8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
329 h->non_zero_count_cache[3+8*1 + 2*8*i]=
330 h->non_zero_count_cache[3+8*2 + 2*8*i]=
331 h->non_zero_count_cache[0+8*1 + 8*i]=
332 h->non_zero_count_cache[0+8*4 + 8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
/* Neighbour coded-block-pattern state (top/left CBP). */
339 h->top_cbp = h->cbp_table[top_xy];
340 } else if(IS_INTRA(mb_type)) {
347 h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
348 } else if(IS_INTRA(mb_type)) {
354 h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
357 h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
/* Inter path: fill motion-vector and reference caches from neighbours. */
362 if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
364 for(list=0; list<h->list_count; list++){
365 if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
366 /*if(!h->mv_cache_clean[list]){
367 memset(h->mv_cache [list], 0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
368 memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
369 h->mv_cache_clean[list]= 1;
373 h->mv_cache_clean[list]= 0;
/* Top neighbour: its bottom row of 4 MVs and bottom pair of refs. */
375 if(USES_LIST(top_type, list)){
376 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
377 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
378 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
379 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
380 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
381 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
382 h->ref_cache[list][scan8[0] + 0 - 1*8]=
383 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
384 h->ref_cache[list][scan8[0] + 2 - 1*8]=
385 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
387 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
388 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
389 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
390 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
391 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
/* Left neighbour(s): two MV/ref pairs per left MB half. */
395 int cache_idx = scan8[0] - 1 + i*2*8;
396 if(USES_LIST(left_type[i], list)){
397 const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
398 const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
399 *(uint32_t*)h->mv_cache[list][cache_idx ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
400 *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
401 h->ref_cache[list][cache_idx ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
402 h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
404 *(uint32_t*)h->mv_cache [list][cache_idx ]=
405 *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
406 h->ref_cache[list][cache_idx ]=
407 h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
411 if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
/* Topleft / topright corner MVs (topleft_partition adjusts the MBAFF case). */
414 if(USES_LIST(topleft_type, list)){
415 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
416 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
417 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
418 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
420 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
421 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
424 if(USES_LIST(topright_type, list)){
425 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
426 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
427 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
428 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
430 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
431 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
434 if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
437 h->ref_cache[list][scan8[5 ]+1] =
438 h->ref_cache[list][scan8[7 ]+1] =
439 h->ref_cache[list][scan8[13]+1] = //FIXME remove past 3 (init somewhere else)
440 h->ref_cache[list][scan8[4 ]] =
441 h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
442 *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
443 *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
444 *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
445 *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
446 *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
449 /* XXX beurk, Load mvd */
450 if(USES_LIST(top_type, list)){
451 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
452 *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
453 *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
454 *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
455 *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
457 *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
458 *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
459 *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
460 *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
462 if(USES_LIST(left_type[0], list)){
463 const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
464 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
465 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
467 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
468 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
470 if(USES_LIST(left_type[1], list)){
471 const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
472 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
473 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
475 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
476 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
478 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
479 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
480 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
481 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
482 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
/* B slices: per-8x8 direct-mode flags from the top/left neighbours. */
484 if(h->slice_type == FF_B_TYPE){
485 fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
487 if(IS_DIRECT(top_type)){
488 *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
489 }else if(IS_8X8(top_type)){
490 int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
491 h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
492 h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
494 *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
497 if(IS_DIRECT(left_type[0]))
498 h->direct_cache[scan8[0] - 1 + 0*8]= 1;
499 else if(IS_8X8(left_type[0]))
500 h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
502 h->direct_cache[scan8[0] - 1 + 0*8]= 0;
504 if(IS_DIRECT(left_type[1]))
505 h->direct_cache[scan8[0] - 1 + 2*8]= 1;
506 else if(IS_8X8(left_type[1]))
507 h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
509 h->direct_cache[scan8[0] - 1 + 2*8]= 0;
/* MBAFF: MAP_F2F rescales neighbour refs/MVs between frame and field
 * coordinate systems (ref index doubles/halves, vertical MV halves/doubles). */
515 MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
516 MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
517 MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
518 MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
519 MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
520 MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
521 MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
522 MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
523 MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
524 MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
526 #define MAP_F2F(idx, mb_type)\
527 if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
528 h->ref_cache[list][idx] <<= 1;\
529 h->mv_cache[list][idx][1] /= 2;\
530 h->mvd_cache[list][idx][1] /= 2;\
535 #define MAP_F2F(idx, mb_type)\
536 if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
537 h->ref_cache[list][idx] >>= 1;\
538 h->mv_cache[list][idx][1] <<= 1;\
539 h->mvd_cache[list][idx][1] <<= 1;\
/* Remember neighbours' 8x8 DCT usage for the transform-size CABAC context. */
549 h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
552 static inline void write_back_intra_pred_mode(H264Context *h){
553 MpegEncContext * const s = &h->s;
554 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
556 h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
557 h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
558 h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
559 h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
560 h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
561 h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
562 h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
/**
 * check_intra4x4_pred_mode(): validates the cached intra4x4 modes against
 * the availability masks; unavailable directional modes are remapped to a
 * DC fallback via the static top[]/left[] tables, -1 entries are errors.
 * NOTE(review): excerpt is garbled -- loop headers, the "status<0" error
 * returns and the final "return 0;" are missing, and each line carries a
 * stray embedded line number.
 */
566 * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
568 static inline int check_intra4x4_pred_mode(H264Context *h){
569 MpegEncContext * const s = &h->s;
/* Remap tables: index = requested mode; -1 keeps the mode, negative-other
 * values signal an invalid mode for that missing neighbour. */
570 static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
571 static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
/* Top row unavailable: remap the four modes of the top 4x4 row. */
574 if(!(h->top_samples_available&0x8000)){
576 int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
578 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
581 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
/* Left column unavailable: remap the four modes of the left 4x4 column. */
586 if(!(h->left_samples_available&0x8000)){
588 int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
590 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
593 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
599 } //FIXME cleanup like next
/**
 * check_intra_pred_mode(): same idea as check_intra4x4_pred_mode but for a
 * whole-MB (16x16 luma / 8x8 chroma) prediction mode: validates the range
 * and substitutes a DC variant when the needed neighbour is unavailable.
 * NOTE(review): excerpt is garbled -- the mode reassignments, error returns
 * and "return mode;" are missing, plus stray embedded line numbers.
 */
602 * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
604 static inline int check_intra_pred_mode(H264Context *h, int mode){
605 MpegEncContext * const s = &h->s;
/* Remap tables indexed by mode; negative-other values are hard errors. */
606 static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
607 static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
610 av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
614 if(!(h->top_samples_available&0x8000)){
617 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
622 if(!(h->left_samples_available&0x8000)){
625 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
634 * gets the predicted intra4x4 prediction mode.
636 static inline int pred_intra_mode(H264Context *h, int n){
637 const int index8= scan8[n];
638 const int left= h->intra4x4_pred_mode_cache[index8 - 1];
639 const int top = h->intra4x4_pred_mode_cache[index8 - 8];
640 const int min= FFMIN(left, top);
642 tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
644 if(min<0) return DC_PRED;
/**
 * write_back_non_zero_count(): stores the right column / bottom row of the
 * non-zero-count cache (luma + chroma positions) back into per-macroblock
 * storage, and packs all 16 luma nnz flags into a 16-bit word at index 14
 * for the deblocker.
 * NOTE(review): excerpt is garbled -- the surrounding conditional for the
 * packed-flags section, the loop header and closing braces are missing,
 * and each line carries a stray embedded line number.
 */
648 static inline void write_back_non_zero_count(H264Context *h){
649 MpegEncContext * const s = &h->s;
650 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
/* Luma edge (indices 0-6) from the cache's bottom row / right column. */
652 h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
653 h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
654 h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
655 h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
656 h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
657 h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
658 h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
/* Chroma positions (indices 7-12). */
660 h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
661 h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
662 h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
664 h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
665 h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
666 h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
669 // store all luma nnzs, for deblocking
672 v += (!!h->non_zero_count_cache[scan8[i]]) << i;
673 *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
678 * gets the predicted number of non zero coefficients.
679 * @param n block index
681 static inline int pred_non_zero_count(H264Context *h, int n){
682 const int index8= scan8[n];
683 const int left= h->non_zero_count_cache[index8 - 1];
684 const int top = h->non_zero_count_cache[index8 - 8];
687 if(i<64) i= (i+1)>>1;
689 tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
/**
 * fetch_diagonal_mv(): returns (via *C and the return value) the top-right
 * neighbour MV/ref for MV prediction, falling back to the top-left when
 * the top-right partition is unavailable. Contains special MBAFF handling
 * that rescales MVs/refs between frame and field neighbours in place.
 * NOTE(review): excerpt is garbled -- several condition lines, braces and
 * the #undef of SET_DIAG_MV are missing; stray line numbers are embedded.
 */
694 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
695 const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
696 MpegEncContext *s = &h->s;
698 /* there is no consistent mapping of mvs to neighboring locations that will
699 * make mbaff happy, so we can't move all this logic to fill_caches */
701 const uint32_t *mb_types = s->current_picture_ptr->mb_type;
/* Scratch slot scan8[0]-2 holds a locally rescaled MV for the caller. */
703 *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
704 *C = h->mv_cache[list][scan8[0]-2];
707 && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
708 int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
709 if(IS_INTERLACED(mb_types[topright_xy])){
/* SET_DIAG_MV: fetch the MV/ref at 4x4 position (x4,y4) from the current
 * picture, store the (re)scaled MV in the scratch slot and return the
 * correspondingly scaled reference index. */
710 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
711 const int x4 = X4, y4 = Y4;\
712 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
713 if(!USES_LIST(mb_type,list))\
714 return LIST_NOT_USED;\
715 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
716 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
717 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
718 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
720 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
/* Top-right unavailable: try the left neighbour column instead. */
723 if(topright_ref == PART_NOT_AVAILABLE
724 && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
725 && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
727 && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
728 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
731 && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
733 // leftshift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's ok.
734 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
/* Non-MBAFF (or no special case hit): plain cache lookups. */
740 if(topright_ref != PART_NOT_AVAILABLE){
741 *C= h->mv_cache[list][ i - 8 + part_width ];
744 tprintf(s->avctx, "topright MV not available\n");
/* Fall back to the top-left neighbour. */
746 *C= h->mv_cache[list][ i - 8 - 1 ];
747 return h->ref_cache[list][ i - 8 - 1 ];
/**
 * pred_motion(): median MV prediction per H.264 -- predictor is the
 * component-wise median of left (A), top (B) and top-right/diagonal (C)
 * neighbours; single-match and unavailability special cases select one
 * neighbour directly.
 * NOTE(review): excerpt is garbled -- the single-match branch bodies, the
 * left-only special case assignments and closing braces are missing, with
 * stray embedded line numbers on every line.
 */
752 * gets the predicted MV.
753 * @param n the block index
754 * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
755 * @param mx the x component of the predicted motion vector
756 * @param my the y component of the predicted motion vector
758 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
759 const int index8= scan8[n];
760 const int top_ref= h->ref_cache[list][ index8 - 8 ];
761 const int left_ref= h->ref_cache[list][ index8 - 1 ];
762 const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
763 const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
765 int diagonal_ref, match_count;
767 assert(part_width==1 || part_width==2 || part_width==4);
/* C (and its ref) come from the top-right, or top-left as fallback. */
777 diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
778 match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
779 tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
780 if(match_count > 1){ //most common
781 *mx= mid_pred(A[0], B[0], C[0]);
782 *my= mid_pred(A[1], B[1], C[1]);
783 }else if(match_count==1){
/* Exactly one neighbour uses this ref: take it verbatim. */
787 }else if(top_ref==ref){
/* No single match: median unless only the left neighbour exists. */
795 if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
799 *mx= mid_pred(A[0], B[0], C[0]);
800 *my= mid_pred(A[1], B[1], C[1]);
804 tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
/**
 * pred_16x8_motion(): directional predictor for 16x8 partitions -- the top
 * partition prefers the top neighbour (B), the bottom partition the left
 * neighbour (A), falling back to the generic median predictor otherwise.
 * NOTE(review): excerpt is garbled -- partition selection conditionals,
 * the *mx/*my assignments and returns are missing, with stray embedded
 * line numbers on every line.
 */
808 * gets the directionally predicted 16x8 MV.
809 * @param n the block index
810 * @param mx the x component of the predicted motion vector
811 * @param my the y component of the predicted motion vector
/* Top 16x8 half: prefer B (top neighbour) when it uses the same ref. */
813 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
815 const int top_ref= h->ref_cache[list][ scan8[0] - 8 ];
816 const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
818 tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
/* Bottom 16x8 half: prefer A (left neighbour) when it uses the same ref. */
826 const int left_ref= h->ref_cache[list][ scan8[8] - 1 ];
827 const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
829 tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
/* No directional match: fall back to the median predictor. */
839 pred_motion(h, n, 4, list, ref, mx, my);
/**
 * pred_8x16_motion(): directional predictor for 8x16 partitions -- the left
 * partition prefers the left neighbour (A), the right partition the
 * diagonal neighbour (C), falling back to the median predictor otherwise.
 * NOTE(review): excerpt is garbled -- partition selection conditionals,
 * assignments and returns are missing, with stray embedded line numbers.
 */
843 * gets the directionally predicted 8x16 MV.
844 * @param n the block index
845 * @param mx the x component of the predicted motion vector
846 * @param my the y component of the predicted motion vector
/* Left 8x16 half: prefer A (left neighbour) when it uses the same ref. */
848 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
850 const int left_ref= h->ref_cache[list][ scan8[0] - 1 ];
851 const int16_t * const A= h->mv_cache[list][ scan8[0] - 1 ];
853 tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
/* Right 8x16 half: prefer the diagonal neighbour C. */
864 diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
866 tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
868 if(diagonal_ref == ref){
/* No directional match: fall back to the median predictor. */
876 pred_motion(h, n, 2, list, ref, mx, my);
/**
 * pred_pskip_motion(): MV prediction for P_Skip macroblocks -- the MV is
 * forced to (0,0) when a neighbour is unavailable or when a neighbour with
 * ref 0 already has a zero MV; otherwise the normal median predictor of
 * the whole MB is used.
 * NOTE(review): excerpt is garbled -- the "*mx=*my=0; return;" body of the
 * zero-MV case and closing braces are missing; stray line numbers embedded.
 */
879 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
880 const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
881 const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
883 tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
/* Zero-MV conditions from the spec's P_Skip derivation. */
885 if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
886 || (top_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
887 || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
/* Otherwise: ordinary 16x16 median prediction with ref 0, list 0. */
893 pred_motion(h, 0, 4, 0, 0, mx, my);
/**
 * direct_dist_scale_factor(): precomputes, per list-0 reference, the
 * temporal-direct distance scale factor (tb*tx+32)>>6 clipped to
 * [-1024,1023], with 256 (unit scale) when the POC distance td is 0.
 * Also mirrors the values into the field array (visible tail; the guarding
 * condition for that section is missing from this excerpt).
 * NOTE(review): excerpt is garbled -- closing braces and the FRAME_MBAFF
 * conditional are missing; stray line numbers embedded.
 */
898 static inline void direct_dist_scale_factor(H264Context * const h){
899 const int poc = h->s.current_picture_ptr->poc;
900 const int poc1 = h->ref_list[1][0].poc;
902 for(i=0; i<h->ref_count[0]; i++){
903 int poc0 = h->ref_list[0][i].poc;
/* td = POC distance between the L1 anchor and this L0 reference. */
904 int td = av_clip(poc1 - poc0, -128, 127);
905 if(td == 0 /* FIXME || pic0 is a long-term ref */){
906 h->dist_scale_factor[i] = 256;
908 int tb = av_clip(poc - poc0, -128, 127);
909 int tx = (16384 + (FFABS(td) >> 1)) / td;
910 h->dist_scale_factor[i] = av_clip((tb*tx + 32) >> 6, -1024, 1023);
/* Duplicate per-frame factors for both field parities. */
914 for(i=0; i<h->ref_count[0]; i++){
915 h->dist_scale_factor_field[2*i] =
916 h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
/**
 * direct_ref_list_init(): records the current picture's reference counts
 * and POCs, then builds map_col_to_list0[][], which maps each reference of
 * the co-located (list-1 anchor) picture to the current list-0 reference
 * with the same POC -- needed for temporal direct mode. The tail mirrors
 * the mapping into the per-field arrays.
 * NOTE(review): excerpt is garbled -- some braces, the early return's body
 * and the FRAME_MBAFF conditional are missing; stray line numbers embedded.
 */
920 static inline void direct_ref_list_init(H264Context * const h){
921 MpegEncContext * const s = &h->s;
922 Picture * const ref1 = &h->ref_list[1][0];
923 Picture * const cur = s->current_picture_ptr;
925 if(cur->pict_type == FF_I_TYPE)
926 cur->ref_count[0] = 0;
927 if(cur->pict_type != FF_B_TYPE)
928 cur->ref_count[1] = 0;
/* Snapshot ref counts and POCs so future B frames can use them as the
 * co-located picture's data. */
929 for(list=0; list<2; list++){
930 cur->ref_count[list] = h->ref_count[list];
931 for(j=0; j<h->ref_count[list]; j++)
932 cur->ref_poc[list][j] = h->ref_list[list][j].poc;
/* Mapping only needed for temporal direct B frames. */
934 if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
936 for(list=0; list<2; list++){
937 for(i=0; i<ref1->ref_count[list]; i++){
938 const int poc = ref1->ref_poc[list][i];
939 h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
940 for(j=0; j<h->ref_count[list]; j++)
941 if(h->ref_list[list][j].poc == poc){
942 h->map_col_to_list0[list][i] = j;
/* Duplicate the mapping for both field parities. */
948 for(list=0; list<2; list++){
949 for(i=0; i<ref1->ref_count[list]; i++){
950 j = h->map_col_to_list0[list][i];
951 h->map_col_to_list0_field[list][2*i] = 2*j;
952 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
static inline void pred_direct_motion(H264Context * const h, int *mb_type){
    /* Fill mv_cache/ref_cache for a B-direct macroblock, using either
     * spatial direct prediction (h->direct_spatial_mv_pred) or temporal
     * direct prediction (scaling the co-located MVs of ref_list[1][0]).
     * May rewrite *mb_type and the sub_mb_type entries to the partition
     * sizes actually derived. */
    MpegEncContext * const s = &h->s;
    const int mb_xy =   s->mb_x +   s->mb_y*s->mb_stride;
    const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;   /* position in 8x8-block units */
    const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;    /* position in 4x4-block units */
    const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy]; /* co-located MB type in L1[0] */
    /* co-located motion vectors / reference indices for both lists */
    const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
    const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
    const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
    const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
    const int is_b8x8 = IS_8X8(*mb_type);
    unsigned int sub_mb_type;

#define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
    if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
        /* FIXME save sub mb types from previous frames (or derive from MVs)
         * so we know exactly what block size to use */
        sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
        *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
    }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
        sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
        *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
        sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
        *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
        *mb_type |= MB_TYPE_DIRECT2;
        *mb_type |= MB_TYPE_INTERLACED;

    tprintf(s->avctx, "mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);

    if(h->direct_spatial_mv_pred){
        /* FIXME interlacing + spatial direct uses wrong colocated block positions */
        /* ref = min(neighbors) — left (A), top (B), top-right (C; falls back to
         * top-left when C is unavailable) */
        for(list=0; list<2; list++){
            int refa = h->ref_cache[list][scan8[0] - 1];
            int refb = h->ref_cache[list][scan8[0] - 8];
            int refc = h->ref_cache[list][scan8[0] - 8 + 4];
                refc = h->ref_cache[list][scan8[0] - 8 - 1];
            if(ref[list] < 0 || (refb < ref[list] && refb >= 0))
            if(ref[list] < 0 || (refc < ref[list] && refc >= 0))
        /* no valid neighbor reference in either list: direct-zero prediction */
        if(ref[0] < 0 && ref[1] < 0){
            ref[0] = ref[1] = 0;
            mv[0][0] = mv[0][1] =
            mv[1][0] = mv[1][1] = 0;
            for(list=0; list<2; list++){
                    pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
                    mv[list][0] = mv[list][1] = 0;
            /* list without a valid reference is dropped from the MB/sub-MB types */
            *mb_type &= ~MB_TYPE_L1;
            sub_mb_type &= ~MB_TYPE_L1;
        }else if(ref[0] < 0){
            *mb_type &= ~MB_TYPE_L0;
            sub_mb_type &= ~MB_TYPE_L0;

        if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
            /* current and co-located MBs differ in field/frame coding: re-address the
             * co-located data so each 8x8 block reads from the matching parity */
            int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
            int mb_types_col[2];
            int b8_stride = h->b8_stride;
            int b4_stride = h->b_stride;

            *mb_type = (*mb_type & ~MB_TYPE_16x16) | MB_TYPE_8x8;

            if(IS_INTERLACED(*mb_type)){
                mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
                mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
                    l1ref0 -= 2*b8_stride;
                    l1ref1 -= 2*b8_stride;
                    l1mv0  -= 4*b4_stride;
                    l1mv1  -= 4*b4_stride;
                int cur_poc = s->current_picture_ptr->poc;
                int *col_poc = h->ref_list[1]->field_poc;
                /* pick the co-located field whose POC is closer to the current picture */
                int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
                int dy = 2*col_parity - (s->mb_y&1);
                mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy + col_parity*s->mb_stride];
                l1ref0 += dy*b8_stride;
                l1ref1 += dy*b8_stride;
                l1mv0  += 2*dy*b4_stride;
                l1mv1  += 2*dy*b4_stride;

            for(i8=0; i8<4; i8++){
                int xy8 = x8+y8*b8_stride;
                int xy4 = 3*x8+y8*b4_stride;
                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                h->sub_mb_type[i8] = sub_mb_type;

                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
                /* spec: co-located block refs L1[0]/L0[0] with a tiny MV => use zero MV */
                if(!IS_INTRA(mb_types_col[y8])
                   && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
                       || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
                        a= pack16to32(mv[0][0],mv[0][1]);
                        b= pack16to32(mv[1][0],mv[1][1]);
                    a= pack16to32(mv[0][0],mv[0][1]);
                    b= pack16to32(mv[1][0],mv[1][1]);
                fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
                fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
        }else if(IS_16X16(*mb_type)){
            fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
            fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
            /* the x264_build check works around an old x264 bug; 0 means "not x264" */
            if(!IS_INTRA(mb_type_col)
               && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
                   || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
                       && (h->x264_build>33 || !h->x264_build)))){
                    a= pack16to32(mv[0][0],mv[0][1]);
                    b= pack16to32(mv[1][0],mv[1][1]);
                a= pack16to32(mv[0][0],mv[0][1]);
                b= pack16to32(mv[1][0],mv[1][1]);
            fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
            fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
            for(i8=0; i8<4; i8++){
                const int x8 = i8&1;
                const int y8 = i8>>1;

                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                h->sub_mb_type[i8] = sub_mb_type;

                fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
                fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);

                /* zero out sub-blocks whose co-located MV is (near) zero */
                if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
                                              || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
                                                  && (h->x264_build>33 || !h->x264_build)))){
                    const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
                    if(IS_SUB_8X8(sub_mb_type)){
                        const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
                        if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
                                fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                                fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                        for(i4=0; i4<4; i4++){
                            const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
                            if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
                                    *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
                                    *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
    }else{ /* direct temporal mv pred */
        const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
        const int *dist_scale_factor = h->dist_scale_factor;

        /* field macroblocks use the per-field remap/scale tables */
        if(IS_INTERLACED(*mb_type)){
            map_col_to_list0[0] = h->map_col_to_list0_field[0];
            map_col_to_list0[1] = h->map_col_to_list0_field[1];
            dist_scale_factor = h->dist_scale_factor_field;
        if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
            /* FIXME assumes direct_8x8_inference == 1 */
            const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
            int mb_types_col[2];

            *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
                     | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
                     | (*mb_type & MB_TYPE_INTERLACED);
            sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;

            if(IS_INTERLACED(*mb_type)){
                /* frame to field scaling */
                mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
                mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
                    l1ref0 -= 2*h->b8_stride;
                    l1ref1 -= 2*h->b8_stride;
                    l1mv0  -= 4*h->b_stride;
                    l1mv1  -= 4*h->b_stride;
                if(   (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
                   && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
                    *mb_type |= MB_TYPE_16x8;
                    *mb_type |= MB_TYPE_8x8;
                /* field to frame scaling */
                /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
                 * but in MBAFF, top and bottom POC are equal */
                int dy = (s->mb_y&1) ? 1 : 2;
                mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
                l1ref0 += dy*h->b8_stride;
                l1ref1 += dy*h->b8_stride;
                l1mv0  += 2*dy*h->b_stride;
                l1mv1  += 2*dy*h->b_stride;
                if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
                    *mb_type |= MB_TYPE_16x16;
                    *mb_type |= MB_TYPE_8x8;

            for(i8=0; i8<4; i8++){
                const int x8 = i8&1;
                const int y8 = i8>>1;
                const int16_t (*l1mv)[2]= l1mv0;

                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                h->sub_mb_type[i8] = sub_mb_type;

                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                if(IS_INTRA(mb_types_col[y8])){
                    /* intra co-located block: direct prediction uses ref 0 with zero MVs */
                    fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
                    fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                    fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                    ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
                    ref0 = map_col_to_list0[0][ref0*2>>y_shift];
                    ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
                scale = dist_scale_factor[ref0];
                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
                    const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
                    int my_col = (mv_col[1]<<y_shift)/2;  /* vertical MV rescaled across field/frame */
                    int mx = (scale * mv_col[0] + 128) >> 8;
                    int my = (scale * my_col    + 128) >> 8;
                    fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
                    /* L1 MV is the L0 MV minus the co-located MV (temporal direct) */
                    fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
        /* one-to-one mv scaling */

        if(IS_16X16(*mb_type)){
            fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
            if(IS_INTRA(mb_type_col)){
                const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
                                                : map_col_to_list0[1][l1ref1[0]];
                const int scale = dist_scale_factor[ref0];
                const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
                mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
                mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
                mv0= pack16to32(mv_l0[0],mv_l0[1]);
                mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
            fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
            fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
            fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
            for(i8=0; i8<4; i8++){
                const int x8 = i8&1;
                const int y8 = i8>>1;
                const int16_t (*l1mv)[2]= l1mv0;

                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                h->sub_mb_type[i8] = sub_mb_type;
                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                if(IS_INTRA(mb_type_col)){
                    fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
                    fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                    fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                    ref0 = l1ref0[x8 + y8*h->b8_stride];
                    ref0 = map_col_to_list0[0][ref0];
                    ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
                scale = dist_scale_factor[ref0];

                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
                if(IS_SUB_8X8(sub_mb_type)){
                    const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
                    int mx = (scale * mv_col[0] + 128) >> 8;
                    int my = (scale * mv_col[1] + 128) >> 8;
                    fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
                    fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
                    for(i4=0; i4<4; i4++){
                        const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
                        int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
                        mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
                        mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
                        *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
                            pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
static inline void write_back_motion(H264Context *h, int mb_type){
    /* Copy the per-MB motion data (MVs, reference indices, CABAC mvd values,
     * direct flags) from the decode caches back into the frame-wide tables of
     * the current picture. */
    MpegEncContext * const s = &h->s;
    const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;    /* 4x4-block position */
    const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;   /* 8x8-block position */

    if(!USES_LIST(mb_type, 0))
        fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);

    for(list=0; list<h->list_count; list++){
        if(!USES_LIST(mb_type, list))
            /* two 64-bit stores cover one row of four 16-bit MV pairs */
            *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
            *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
        if( h->pps.cabac ) {
            if(IS_SKIP(mb_type))
                /* skipped MBs carry no MV differences */
                fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
                *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
                *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
            /* one reference index per 8x8 block */
            int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
            ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
            ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
            ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
            ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];

    if(h->slice_type == FF_B_TYPE && h->pps.cabac){
        if(IS_8X8(mb_type)){
            /* remember which 8x8 partitions used direct mode (CABAC context) */
            uint8_t *direct_table = &h->direct_table[b8_xy];
            direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
            direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
            direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1376 * Decodes a network abstraction layer unit.
1377 * @param consumed is the number of bytes used as input
1378 * @param length is the length of the array
1379 * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1380 * @returns decoded bytes, might be src+1 if no escapes
static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
    /* Parse the NAL header and un-escape the RBSP: remove 00 00 03 emulation
     * prevention bytes.  Returns a pointer to the unescaped payload (may be
     * src+1 when no escapes are present, avoiding a copy). */
    // src[0]&0x80;              //forbidden bit
    h->nal_ref_idc= src[0]>>5;
    h->nal_unit_type= src[0]&0x1F;

    for(i=0; i<length; i++)
        printf("%2X ", src[i]);
    /* scan in steps of two: an escape sequence always contains a zero at an
     * even or odd position, so only every other byte needs checking */
    for(i=0; i+1<length; i+=2){
        if(src[i]) continue;
        if(i>0 && src[i-1]==0) i--;
        if(i+2<length && src[i+1]==0 && src[i+2]<=3){
            /* startcode, so we must be past the end */

    if(i>=length-1){ //no escaped 0
        *dst_length= length;
        *consumed= length+1; //+1 for the header

    bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
    h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
    dst= h->rbsp_buffer[bufidx];

//printf("decoding esc\n");
        //remove escapes (very rare 1:2^22)
        if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
            if(src[si+2]==3){ //escape
            }else //next start code

        dst[di++]= src[si++];

    *consumed= si + 1;//+1 for the header
//FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1446 * identifies the exact end of the bitstream
1447 * @return the length of the trailing, or 0 if damaged
/* Locate the rbsp_stop_one_bit to find the exact end of the bitstream. */
static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
    tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1463 * idct tranforms the 16 dc values and dequantize them.
1464 * @param qp quantization parameter
static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
    /* 4x4 Hadamard inverse transform of the 16 luma DC coefficients followed
     * by dequantization with qmul; results are written back to the DC
     * positions of the 16 4x4 blocks inside `block`. */
    int temp[16]; //FIXME check if this is a good idea
    static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};

//memset(block, 64, 2*256);
        /* horizontal pass: butterflies over the four DC values of one row */
        const int offset= y_offset[i];
        const int z0= block[offset+stride*0] + block[offset+stride*4];
        const int z1= block[offset+stride*0] - block[offset+stride*4];
        const int z2= block[offset+stride*1] - block[offset+stride*5];
        const int z3= block[offset+stride*1] + block[offset+stride*5];
        /* vertical pass plus dequant: (x*qmul + 128) >> 8 rounds to nearest */
        const int offset= x_offset[i];
        const int z0= temp[4*0+i] + temp[4*2+i];
        const int z1= temp[4*0+i] - temp[4*2+i];
        const int z2= temp[4*1+i] - temp[4*3+i];
        const int z3= temp[4*1+i] + temp[4*3+i];

        block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_resdual
        block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
        block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
        block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1504 * dct tranforms the 16 dc values.
1505 * @param qp quantization parameter ??? FIXME
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
    /* Forward 4x4 Hadamard transform of the 16 luma DC values (encoder side);
     * mirrors h264_luma_dc_dequant_idct_c without the dequant step. */
//  const int qmul= dequant_coeff[qp][0];
    int temp[16]; //FIXME check if this is a good idea
    static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};

        /* horizontal butterfly pass into temp[] */
        const int offset= y_offset[i];
        const int z0= block[offset+stride*0] + block[offset+stride*4];
        const int z1= block[offset+stride*0] - block[offset+stride*4];
        const int z2= block[offset+stride*1] - block[offset+stride*5];
        const int z3= block[offset+stride*1] + block[offset+stride*5];

        /* vertical butterfly pass; >>1 normalizes the transform gain */
        const int offset= x_offset[i];
        const int z0= temp[4*0+i] + temp[4*2+i];
        const int z1= temp[4*0+i] - temp[4*2+i];
        const int z2= temp[4*1+i] - temp[4*3+i];
        const int z3= temp[4*1+i] + temp[4*3+i];

        block[stride*0 +offset]= (z0 + z3)>>1;
        block[stride*2 +offset]= (z1 + z2)>>1;
        block[stride*8 +offset]= (z1 - z2)>>1;
        block[stride*10+offset]= (z0 - z3)>>1;
static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
    /* 2x2 inverse Hadamard transform plus dequantization of the four chroma
     * DC coefficients, written back in place. */
    const int stride= 16*2;
    const int xStride= 16;

    a= block[stride*0 + xStride*0];
    b= block[stride*0 + xStride*1];
    c= block[stride*1 + xStride*0];
    d= block[stride*1 + xStride*1];

    /* butterfly outputs scaled by qmul; >>7 removes the quant scale */
    block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
    block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
    block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
    block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
static void chroma_dc_dct_c(DCTELEM *block){
    /* Forward 2x2 Hadamard transform of the four chroma DC values
     * (encoder side counterpart of chroma_dc_dequant_idct_c). */
    const int stride= 16*2;
    const int xStride= 16;

    a= block[stride*0 + xStride*0];
    b= block[stride*0 + xStride*1];
    c= block[stride*1 + xStride*0];
    d= block[stride*1 + xStride*1];

    block[stride*0 + xStride*0]= (a+c);
    block[stride*0 + xStride*1]= (e+b);
    block[stride*1 + xStride*0]= (a-c);
    block[stride*1 + xStride*1]= (e-b);
1590 * gets the chroma qp.
/* Map a luma qscale to the chroma QP via the PPS lookup table;
 * t selects the chroma component (Cb/Cr offsets differ). */
static inline int get_chroma_qp(H264Context *h, int t, int qscale){
    return h->pps.chroma_qp_table[t][qscale & 0xff];
1596 //FIXME need to check that this does not overflow signed 32 bit for low qp, I am not sure, it's very close
1597 //FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
    /* Quantize a block in scan order (encoder side).  bias is 1/3 of the
     * quant step for intra and 1/6 for inter; the threshold trick skips the
     * full rounding for coefficients that quantize to zero.  Returns the
     * index of the last non-zero coefficient. */
    const int * const quant_table= quant_coeff[qscale];
    const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
    const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
    const unsigned int threshold2= (threshold1<<1);

            /* DC with 2-bit finer quantization (QUANT_SHIFT-2) */
            const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
            const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
            const unsigned int dc_threshold2= (dc_threshold1<<1);

            int level= block[0]*quant_coeff[qscale+18][0];
            if(((unsigned)(level+dc_threshold1))>dc_threshold2){
                    level= (dc_bias + level)>>(QUANT_SHIFT-2);
                    level= (dc_bias - level)>>(QUANT_SHIFT-2);
//                    last_non_zero = i;
            /* DC with 1-bit coarser quantization (QUANT_SHIFT+1) */
            const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
            const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
            const unsigned int dc_threshold2= (dc_threshold1<<1);

            int level= block[0]*quant_table[0];
            if(((unsigned)(level+dc_threshold1))>dc_threshold2){
                    level= (dc_bias + level)>>(QUANT_SHIFT+1);
                    level= (dc_bias - level)>>(QUANT_SHIFT+1);
//                    last_non_zero = i;

        /* AC coefficients in scan order */
        const int j= scantable[i];
        int level= block[j]*quant_table[j];

//        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
//           || bias-level >= (1<<(QMAT_SHIFT - 3))){
        if(((unsigned)(level+threshold1))>threshold2){
                level= (bias + level)>>QUANT_SHIFT;
                level= (bias - level)>>QUANT_SHIFT;

    return last_non_zero;
/* Motion compensation for one partition in one direction (one list):
 * quarter-pel luma interpolation and eighth-pel chroma interpolation,
 * with edge emulation when the MV points outside the picture. */
static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int src_x_offset, int src_y_offset,
                           qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
    MpegEncContext * const s = &h->s;
    const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;  /* quarter-pel x */
    int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;  /* quarter-pel y */
    const int luma_xy= (mx&3) + ((my&3)<<2);   /* fractional part selects the qpel filter */
    uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
    uint8_t * src_cb, * src_cr;
    int extra_width= h->emu_edge_width;
    int extra_height= h->emu_edge_height;
    const int full_mx= mx>>2;
    const int full_my= my>>2;
    const int pic_width  = 16*s->mb_width;
    const int pic_height = 16*s->mb_height >> MB_FIELD;

    if(!pic->data[0]) //FIXME this is unacceptable, some senseable error concealment must be done for missing reference frames

    /* fractional MVs need 3 extra filter-tap pixels on each side */
    if(mx&7) extra_width -= 3;
    if(my&7) extra_height -= 3;

    if(   full_mx < 0-extra_width
       || full_my < 0-extra_height
       || full_mx + 16/*FIXME*/ > pic_width + extra_width
       || full_my + 16/*FIXME*/ > pic_height + extra_height){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
            src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;

    qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?

        /* non-square partition: second half at `delta` offset */
        qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);

    if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;

        // chroma offset when predicting from a field of opposite parity
        my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
        emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);

    src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
    src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;

        ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
        src_cb= s->edge_emu_buffer;
    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);

        ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
        src_cr= s->edge_emu_buffer;
    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
/* Unweighted motion compensation for one partition: L0 prediction with the
 * `put` functions, then (for bi-prediction) L1 averaged in with `avg`. */
static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int x_offset, int y_offset,
                           qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
                           qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
                           int list0, int list1){
    MpegEncContext * const s = &h->s;
    qpel_mc_func *qpix_op=  qpix_put;
    h264_chroma_mc_func chroma_op= chroma_put;

    /* advance dest pointers to the partition, then convert offsets to
     * picture coordinates for the source fetch */
    dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
    dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
    dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
    x_offset += 8*s->mb_x;
    y_offset += 8*(s->mb_y >> MB_FIELD);

        Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
        mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
                    dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_op, chroma_op);

        /* second direction averages on top of the first */
        chroma_op= chroma_avg;

        Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
        mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
                    dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_op, chroma_op);
/* Weighted motion compensation for one partition: explicit (use_weight==1)
 * or implicit (use_weight==2) weighted prediction, uni- or bi-directional. */
static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int x_offset, int y_offset,
                           qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
                           h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
                           h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
                           int list0, int list1){
    MpegEncContext * const s = &h->s;

    dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
    dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
    dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
    x_offset += 8*s->mb_x;
    y_offset += 8*(s->mb_y >> MB_FIELD);

        /* don't optimize for luma-only case, since B-frames usually
         * use implicit weights => chroma too. */
        /* bi-directional: predict L1 into a scratch buffer, then blend with L0 */
        uint8_t *tmp_cb = s->obmc_scratchpad;
        uint8_t *tmp_cr = s->obmc_scratchpad + 8;
        uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
        int refn0 = h->ref_cache[0][ scan8[n] ];
        int refn1 = h->ref_cache[1][ scan8[n] ];

        mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
                    dest_y, dest_cb, dest_cr,
                    x_offset, y_offset, qpix_put, chroma_put);
        mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
                    tmp_y, tmp_cb, tmp_cr,
                    x_offset, y_offset, qpix_put, chroma_put);

        if(h->use_weight == 2){
            /* implicit weights: w0 + w1 == 64, looked up per reference pair */
            int weight0 = h->implicit_weight[refn0][refn1];
            int weight1 = 64 - weight0;
            luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
            /* explicit weights/offsets signalled in the slice header */
            luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
                            h->luma_weight[0][refn0], h->luma_weight[1][refn1],
                            h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                            h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
                            h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                            h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
                            h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
        /* uni-directional: predict in place, then apply the weight */
        int list = list1 ? 1 : 0;
        int refn = h->ref_cache[list][ scan8[n] ];
        Picture *ref= &h->ref_list[list][refn];
        mc_dir_part(h, ref, n, square, chroma_height, delta, list,
                    dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put, chroma_put);

        luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
                       h->luma_weight[list][refn], h->luma_offset[list][refn]);
        if(h->use_weight_chroma){
            chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                             h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
            chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                             h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
/* Dispatch one partition to weighted or standard motion compensation.
 * Implicit bi-prediction with equal weights (32/32) is equivalent to a
 * plain average, so it takes the cheaper unweighted path. */
static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int x_offset, int y_offset,
                           qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
                           qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
                           h264_weight_func *weight_op, h264_biweight_func *weight_avg,
                           int list0, int list1){
    if((h->use_weight==2 && list0 && list1
        && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
       || h->use_weight==1)
        mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
                         x_offset, y_offset, qpix_put, chroma_put,
                         weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
        mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
                    x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
static inline void prefetch_motion(H264Context *h, int list){
    /* fetch pixels for estimated mv 4 macroblocks ahead
     * optimized for 64byte cache lines */
    MpegEncContext * const s = &h->s;
    const int refn = h->ref_cache[list][scan8[0]];
        /* estimated source position: current 16x16 MV plus 4-MB lookahead (+64 below) */
        const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
        const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
        uint8_t **src= h->ref_list[list][refn].data;
        int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
        s->dsp.prefetch(src[0]+off, s->linesize, 4);
        /* chroma planes are half-size; prefetch both via the plane gap */
        off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
        s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
/* Perform motion compensation for a whole inter macroblock, walking the
 * partition tree (16x16 / 16x8 / 8x16 / 8x8 with 8x4, 4x8, 4x4 sub-parts)
 * and calling mc_part() with the matching qpel/chroma function sizes. */
static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                      qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
                      qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
                      h264_weight_func *weight_op, h264_biweight_func *weight_avg){
    MpegEncContext * const s = &h->s;
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
    const int mb_type= s->current_picture.mb_type[mb_xy];

    assert(IS_INTER(mb_type));

    prefetch_motion(h, 0);

    if(IS_16X16(mb_type)){
        mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
                &weight_op[0], &weight_avg[0],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
    }else if(IS_16X8(mb_type)){
        /* top and bottom halves, each with its own direction flags */
        mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
        mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
    }else if(IS_8X16(mb_type)){
        /* left and right halves; delta is a row offset for the second block */
        mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                &weight_op[2], &weight_avg[2],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
        mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                &weight_op[2], &weight_avg[2],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));

        assert(IS_8X8(mb_type));

            /* per-8x8 sub-partition dispatch */
            const int sub_mb_type= h->sub_mb_type[i];
            int x_offset= (i&1)<<2;
            int y_offset= (i&2)<<1;

            if(IS_SUB_8X8(sub_mb_type)){
                mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                    &weight_op[3], &weight_avg[3],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else if(IS_SUB_8X4(sub_mb_type)){
                mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else if(IS_SUB_4X8(sub_mb_type)){
                mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                    &weight_op[5], &weight_avg[5],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                    &weight_op[5], &weight_avg[5],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                assert(IS_SUB_4X4(sub_mb_type));
                    int sub_x_offset= x_offset + 2*(j&1);
                    int sub_y_offset= y_offset +   (j&2);
                    mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
                        qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                        &weight_op[6], &weight_avg[6],
                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));

    prefetch_motion(h, 1);
static av_cold void decode_init_vlc(void){
    /* One-time initialization of all CAVLC tables (coeff_token, total_zeros,
     * run_before and their chroma-DC variants); guarded by `done` since the
     * VLC tables are static and shared. */
    static int done = 0;

        init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
                 &chroma_dc_coeff_token_len [0], 1, 1,
                 &chroma_dc_coeff_token_bits[0], 1, 1, 1);

            init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
                     &coeff_token_len [i][0], 1, 1,
                     &coeff_token_bits[i][0], 1, 1, 1);

            init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
                     &chroma_dc_total_zeros_len [i][0], 1, 1,
                     &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
        for(i=0; i<15; i++){
            init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
                     &total_zeros_len [i][0], 1, 1,
                     &total_zeros_bits[i][0], 1, 1, 1);

            init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
                     &run_len [i][0], 1, 1,
                     &run_bits[i][0], 1, 1, 1);
        /* zeros_left >= 7 uses a single dedicated table */
        init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
                 &run_len [6][0], 1, 1,
                 &run_bits[6][0], 1, 1, 1);
/**
 * Free all per-context tables allocated by alloc_tables()/context_init(),
 * including the per-thread-context buffers, and release cached SPS/PPS.
 * Safe to call on partially-allocated state (av_freep handles NULL).
 */
1997 static void free_tables(H264Context *h){
2000 av_freep(&h->intra4x4_pred_mode);
2001 av_freep(&h->chroma_pred_mode_table);
2002 av_freep(&h->cbp_table);
2003 av_freep(&h->mvd_table[0]);
2004 av_freep(&h->mvd_table[1]);
2005 av_freep(&h->direct_table);
2006 av_freep(&h->non_zero_count);
2007 av_freep(&h->slice_table_base);
/* slice_table points into slice_table_base; clear the derived pointer too */
2008 h->slice_table= NULL;
2010 av_freep(&h->mb2b_xy);
2011 av_freep(&h->mb2b8_xy);
2013 for(i = 0; i < MAX_SPS_COUNT; i++)
2014 av_freep(h->sps_buffers + i);
2016 for(i = 0; i < MAX_PPS_COUNT; i++)
2017 av_freep(h->pps_buffers + i);
/* per-thread buffers are owned by each thread context, not shared */
2019 for(i = 0; i < h->s.avctx->thread_count; i++) {
2020 hx = h->thread_context[i];
2022 av_freep(&hx->top_borders[1]);
2023 av_freep(&hx->top_borders[0]);
2024 av_freep(&hx->s.obmc_scratchpad);
/**
 * Build the 8x8 dequantization tables for all 52 QP values, applying the
 * PPS scaling matrices. If intra (i=0) and inter (i=1) scaling matrices
 * are identical, the inter table aliases the intra buffer to save work.
 * The coefficients are optionally stored transposed to match the layout
 * expected by the non-C IDCT implementation.
 */
2028 static void init_dequant8_coeff_table(H264Context *h){
2030 const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2031 h->dequant8_coeff[0] = h->dequant8_buffer[0];
2032 h->dequant8_coeff[1] = h->dequant8_buffer[1];
2034 for(i=0; i<2; i++ ){
/* identical matrices: share table 0 and skip recomputation */
2035 if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2036 h->dequant8_coeff[1] = h->dequant8_buffer[0];
2040 for(q=0; q<52; q++){
/* qp = 6*div6[q] + rem6[q]; shift by div6, base coeff indexed by rem6 */
2041 int shift = ff_div6[q];
2042 int idx = ff_rem6[q];
2044 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2045 ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2046 h->pps.scaling_matrix8[i][x]) << shift;
/**
 * Build the 4x4 dequantization tables (6 matrices: intra/inter Y, Cb, Cr)
 * for all 52 QP values, applying the PPS scaling matrices. A matrix that
 * equals an earlier one aliases that earlier buffer instead of being
 * recomputed. Coefficients are stored transposed when a non-C IDCT is used.
 */
2051 static void init_dequant4_coeff_table(H264Context *h){
2053 const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2054 for(i=0; i<6; i++ ){
2055 h->dequant4_coeff[i] = h->dequant4_buffer[i];
/* reuse an earlier table when the scaling matrices match */
2057 if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2058 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2065 for(q=0; q<52; q++){
/* +2: 4x4 dequant uses an extra scaling factor of 4 relative to 8x8 */
2066 int shift = ff_div6[q] + 2;
2067 int idx = ff_rem6[q];
2069 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2070 ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2071 h->pps.scaling_matrix4[i][x]) << shift;
/**
 * (Re)build all dequantization tables for the current PPS/SPS.
 * 8x8 tables are only needed when the PPS enables 8x8 transforms.
 * With lossless transform bypass, QP 0 entries are forced to the
 * identity scale (1<<6) so bypass blocks pass through unscaled.
 */
2076 static void init_dequant_tables(H264Context *h){
2078 init_dequant4_coeff_table(h);
2079 if(h->pps.transform_8x8_mode)
2080 init_dequant8_coeff_table(h);
2081 if(h->sps.transform_bypass){
2084 h->dequant4_coeff[i][0][x] = 1<<6;
2085 if(h->pps.transform_8x8_mode)
2088 h->dequant8_coeff[i][0][x] = 1<<6;
2095 * needs width/height
2097 static int alloc_tables(H264Context *h){
2098 MpegEncContext * const s = &h->s;
/* +1 row of padding above the frame for neighbour accesses */
2099 const int big_mb_num= s->mb_stride * (s->mb_height+1);
2102 CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8 * sizeof(uint8_t))
2104 CHECKED_ALLOCZ(h->non_zero_count , big_mb_num * 16 * sizeof(uint8_t))
2105 CHECKED_ALLOCZ(h->slice_table_base , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
2106 CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2108 CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2109 CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2110 CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2111 CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
/* -1 marks "no slice"; slice_table is offset into the padded base array */
2113 memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride) * sizeof(uint8_t));
2114 h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
/* precompute macroblock-index -> 4x4-block / 8x8-block index maps */
2116 CHECKED_ALLOCZ(h->mb2b_xy , big_mb_num * sizeof(uint32_t));
2117 CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2118 for(y=0; y<s->mb_height; y++){
2119 for(x=0; x<s->mb_width; x++){
2120 const int mb_xy= x + y*s->mb_stride;
2121 const int b_xy = 4*x + 4*y*h->b_stride;
2122 const int b8_xy= 2*x + 2*y*h->b8_stride;
2124 h->mb2b_xy [mb_xy]= b_xy;
2125 h->mb2b8_xy[mb_xy]= b8_xy;
/* allocated later in frame_start() once linesize is known */
2129 s->obmc_scratchpad = NULL;
2131 if(!h->dequant4_coeff[0])
2132 init_dequant_tables(h);
2141 * Mimic alloc_tables(), but for every context thread.
 * Shares (does not copy) the big per-frame tables between thread contexts;
 * only truly per-thread state (scratchpad, prediction fn pointers) is
 * set up fresh for dst.
2143 static void clone_tables(H264Context *dst, H264Context *src){
2144 dst->intra4x4_pred_mode = src->intra4x4_pred_mode;
2145 dst->non_zero_count = src->non_zero_count;
2146 dst->slice_table = src->slice_table;
2147 dst->cbp_table = src->cbp_table;
2148 dst->mb2b_xy = src->mb2b_xy;
2149 dst->mb2b8_xy = src->mb2b8_xy;
2150 dst->chroma_pred_mode_table = src->chroma_pred_mode_table;
2151 dst->mvd_table[0] = src->mvd_table[0];
2152 dst->mvd_table[1] = src->mvd_table[1];
2153 dst->direct_table = src->direct_table;
/* allocated lazily per thread in frame_start() */
2155 dst->s.obmc_scratchpad = NULL;
2156 ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2161 * Allocate buffers which are not shared amongst multiple threads.
2163 static int context_init(H264Context *h){
2164 CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2165 CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2169 return -1; // free_tables will clean up for us
/**
 * Initialization shared by all users of the H264Context (decoder, and
 * SVQ3 which reuses this codec): copy dimensions from the AVCodecContext,
 * set up intra prediction, and default both scaling matrices to flat 16
 * (i.e. no custom scaling until an SPS/PPS overrides them).
 */
2172 static av_cold void common_init(H264Context *h){
2173 MpegEncContext * const s = &h->s;
2175 s->width = s->avctx->width;
2176 s->height = s->avctx->height;
2177 s->codec_id= s->avctx->codec->id;
2179 ff_h264_pred_init(&h->hpc, s->codec_id);
/* -1: force dequant table rebuild when the first PPS is seen */
2181 h->dequant_coeff_pps= -1;
2182 s->unrestricted_mv=1;
2183 s->decode=1; //FIXME
2185 memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2186 memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
/**
 * AVCodec init callback for the H.264 decoder.
 * Sets MpegEncContext defaults, output format, and detects AVCC-style
 * extradata (first byte == 1) vs. raw Annex B.
 * NOTE(review): the handling inside the extradata branch is elided in
 * this view — confirm against the full file.
 */
2189 static av_cold int decode_init(AVCodecContext *avctx){
2190 H264Context *h= avctx->priv_data;
2191 MpegEncContext * const s = &h->s;
2193 MPV_decode_defaults(s);
2198 s->out_format = FMT_H264;
2199 s->workaround_bugs= avctx->workaround_bugs;
2202 // s->decode_mb= ff_h263_decode_mb;
2203 s->quarter_sample = 1;
2205 avctx->pix_fmt= PIX_FMT_YUV420P;
/* leading byte 1 => avcC (MP4-style) extradata rather than Annex B */
2209 if(avctx->extradata_size > 0 && avctx->extradata &&
2210 *(char *)avctx->extradata == 1){
/* thread context 0 is the main context itself */
2217 h->thread_context[0] = h;
/**
 * Per-frame setup: start the MPV frame and error resilience, reset
 * key_frame (H.264 derives it from IDR markings, not pict_type),
 * precompute block offsets for frame and field geometry, and allocate
 * the per-thread bipred scratchpad now that linesize is known.
 * @return negative on MPV_frame_start failure
 */
2221 static int frame_start(H264Context *h){
2222 MpegEncContext * const s = &h->s;
2225 if(MPV_frame_start(s, s->avctx) < 0)
2227 ff_er_frame_start(s);
2229 * MPV_frame_start uses pict_type to derive key_frame.
2230 * This is incorrect for H.264; IDR markings must be used.
2231 * Zero here; IDR markings per slice in frame or fields are OR'd in later.
2232 * See decode_nal_units().
2234 s->current_picture_ptr->key_frame= 0;
2236 assert(s->linesize && s->uvlinesize);
/* block_offset[0..23]: frame geometry; [24..47]: field (doubled stride) */
2238 for(i=0; i<16; i++){
2239 h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2240 h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2243 h->block_offset[16+i]=
2244 h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2245 h->block_offset[24+16+i]=
2246 h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2249 /* can't be in alloc_tables because linesize isn't known there.
2250 * FIXME: redo bipred weight to not require extra buffer? */
2251 for(i = 0; i < s->avctx->thread_count; i++)
2252 if(!h->thread_context[i]->s.obmc_scratchpad)
2253 h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2255 /* some macroblocks will be accessed before they're available */
2256 if(FRAME_MBAFF || s->avctx->thread_count > 1)
2257 memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
2259 // s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
/**
 * Save the right column (left_border for the next MB) and bottom row
 * (top_borders for the MB row below) of the just-decoded macroblock,
 * before the deblocking filter overwrites them.
 * Chroma is skipped in grayscale-only decoding.
 */
2263 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2264 MpegEncContext * const s = &h->s;
2268 src_cb -= uvlinesize;
2269 src_cr -= uvlinesize;
2271 // There are two lines saved, the line above the top macroblock of a pair,
2272 // and the line above the bottom macroblock
2273 h->left_border[0]= h->top_borders[0][s->mb_x][15];
2274 for(i=1; i<17; i++){
2275 h->left_border[i]= src_y[15+i* linesize];
/* save the bottom luma row (16 bytes) as next row's top border */
2278 *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y + 16*linesize);
2279 *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2281 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2282 h->left_border[17 ]= h->top_borders[0][s->mb_x][16+7];
2283 h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
2285 h->left_border[i+17 ]= src_cb[7+i*uvlinesize];
2286 h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
2288 *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2289 *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
/**
 * Swap (xchg=1) or restore (xchg=0) the saved left/top border pixels with
 * the current macroblock's neighbours, so intra prediction sees unfiltered
 * neighbour samples while the picture itself keeps the deblocked ones.
 * With deblocking_filter==2 (slice-local filtering), borders are only
 * exchanged across same-slice neighbours.
 */
2293 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2294 MpegEncContext * const s = &h->s;
2301 if(h->deblocking_filter == 2) {
2302 mb_xy = s->mb_x + s->mb_y*s->mb_stride;
2303 deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2304 deblock_top = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2306 deblock_left = (s->mb_x > 0);
2307 deblock_top = (s->mb_y > 0);
/* step back to the row above / column left of the MB */
2310 src_y -= linesize + 1;
2311 src_cb -= uvlinesize + 1;
2312 src_cr -= uvlinesize + 1;
2314 #define XCHG(a,b,t,xchg)\
2321 for(i = !deblock_top; i<17; i++){
2322 XCHG(h->left_border[i ], src_y [i* linesize], temp8, xchg);
2327 XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2328 XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
/* top-right samples come from the next MB's saved top border */
2329 if(s->mb_x+1 < s->mb_width){
2330 XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2334 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2336 for(i = !deblock_top; i<9; i++){
2337 XCHG(h->left_border[i+17 ], src_cb[i*uvlinesize], temp8, xchg);
2338 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
2342 XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2343 XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
/**
 * MBAFF variant of backup_mb_border(): saves borders for a whole
 * macroblock pair (two MBs stacked vertically), hence two top-border
 * lines and a 34-entry left border (2 + 2*16 luma rows).
 */
2348 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
2349 MpegEncContext * const s = &h->s;
2352 src_y -= 2 * linesize;
2353 src_cb -= 2 * uvlinesize;
2354 src_cr -= 2 * uvlinesize;
2356 // There are two lines saved, the line above the top macroblock of a pair,
2357 // and the line above the bottom macroblock
2358 h->left_border[0]= h->top_borders[0][s->mb_x][15];
2359 h->left_border[1]= h->top_borders[1][s->mb_x][15];
2360 for(i=2; i<34; i++){
2361 h->left_border[i]= src_y[15+i* linesize];
/* bottom two luma rows of the pair become the next pair-row's top borders */
2364 *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y + 32*linesize);
2365 *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
2366 *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y + 33*linesize);
2367 *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
2369 if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2370 h->left_border[34 ]= h->top_borders[0][s->mb_x][16+7];
2371 h->left_border[34+ 1]= h->top_borders[1][s->mb_x][16+7];
2372 h->left_border[34+18 ]= h->top_borders[0][s->mb_x][24+7];
2373 h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
2374 for(i=2; i<18; i++){
2375 h->left_border[i+34 ]= src_cb[7+i*uvlinesize];
2376 h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
2378 *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
2379 *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
2380 *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
2381 *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
/**
 * MBAFF variant of xchg_mb_border(): swap/restore the saved borders of a
 * macroblock pair with the neighbouring pixels, covering both top-border
 * lines and the doubled-height left border.
 */
2385 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
2386 MpegEncContext * const s = &h->s;
2389 int deblock_left = (s->mb_x > 0);
/* > 1: the top row of pairs has no pair above it */
2390 int deblock_top = (s->mb_y > 1);
2392 tprintf(s->avctx, "xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
2394 src_y -= 2 * linesize + 1;
2395 src_cb -= 2 * uvlinesize + 1;
2396 src_cr -= 2 * uvlinesize + 1;
2398 #define XCHG(a,b,t,xchg)\
2405 for(i = (!deblock_top)<<1; i<34; i++){
2406 XCHG(h->left_border[i ], src_y [i* linesize], temp8, xchg);
2411 XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2412 XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2413 XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
2414 XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
2415 if(s->mb_x+1 < s->mb_width){
2416 XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2417 XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
2421 if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2423 for(i = (!deblock_top) << 1; i<18; i++){
2424 XCHG(h->left_border[i+34 ], src_cb[i*uvlinesize], temp8, xchg);
2425 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
2429 XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2430 XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2431 XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
2432 XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
/**
 * Reconstruct one macroblock: intra prediction or motion compensation,
 * residual IDCT/add for luma and chroma, plus border bookkeeping and
 * the deblocking filter. 'simple' (compile-time constant via the inline
 * wrappers) disables the rarely-taken paths: MBAFF, PCM, SVQ3, grayscale.
 * NOTE(review): several lines are elided in this view; structure comments
 * below reflect the visible code only.
 */
2437 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2438 MpegEncContext * const s = &h->s;
2439 const int mb_x= s->mb_x;
2440 const int mb_y= s->mb_y;
2441 const int mb_xy= mb_x + mb_y*s->mb_stride;
2442 const int mb_type= s->current_picture.mb_type[mb_xy];
2443 uint8_t *dest_y, *dest_cb, *dest_cr;
2444 int linesize, uvlinesize /*dct_offset*/;
2446 int *block_offset = &h->block_offset[0];
2447 const unsigned int bottom = mb_y & 1;
2448 const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
2449 void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2450 void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
/* destination pointers for this MB in the current picture */
2452 dest_y = s->current_picture.data[0] + (mb_y * 16* s->linesize ) + mb_x * 16;
2453 dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2454 dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2456 s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2457 s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
/* field macroblock: double strides, use field block offsets,
 * and for the bottom field start one line down */
2459 if (!simple && MB_FIELD) {
2460 linesize = h->mb_linesize = s->linesize * 2;
2461 uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2462 block_offset = &h->block_offset[24];
2463 if(mb_y&1){ //FIXME move out of this func?
2464 dest_y -= s->linesize*15;
2465 dest_cb-= s->uvlinesize*7;
2466 dest_cr-= s->uvlinesize*7;
/* remap ref indices to distinguish top/bottom field references */
2470 for(list=0; list<h->list_count; list++){
2471 if(!USES_LIST(mb_type, list))
2473 if(IS_16X16(mb_type)){
2474 int8_t *ref = &h->ref_cache[list][scan8[0]];
2475 fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2477 for(i=0; i<16; i+=4){
2478 //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
2479 int ref = h->ref_cache[list][scan8[i]];
2481 fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2487 linesize = h->mb_linesize = s->linesize;
2488 uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2489 // dct_offset = s->linesize * 16;
/* select IDCT-add functions: bypass, 8x8, or 4x4 */
2492 if(transform_bypass){
2494 idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2495 }else if(IS_8x8DCT(mb_type)){
2496 idct_dc_add = s->dsp.h264_idct8_dc_add;
2497 idct_add = s->dsp.h264_idct8_add;
2499 idct_dc_add = s->dsp.h264_idct_dc_add;
2500 idct_add = s->dsp.h264_idct_add;
/* MBAFF intra: swap in unfiltered pair borders before prediction */
2503 if(!simple && FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
2504 && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
2505 int mbt_y = mb_y&~1;
2506 uint8_t *top_y = s->current_picture.data[0] + (mbt_y * 16* s->linesize ) + mb_x * 16;
2507 uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2508 uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2509 xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
/* I_PCM: copy raw samples straight from h->mb to the picture */
2512 if (!simple && IS_INTRA_PCM(mb_type)) {
2515 // The pixels are stored in h->mb array in the same order as levels,
2516 // copy them in output in the correct order.
2517 for(i=0; i<16; i++) {
2518 for (y=0; y<4; y++) {
2519 for (x=0; x<4; x++) {
2520 *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
2524 for(i=16; i<16+4; i++) {
2525 for (y=0; y<4; y++) {
2526 for (x=0; x<4; x++) {
2527 *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
2531 for(i=20; i<20+4; i++) {
2532 for (y=0; y<4; y++) {
2533 for (x=0; x<4; x++) {
2534 *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
/* intra macroblock: spatial prediction (chroma, then luma 4x4/8x8/16x16) */
2539 if(IS_INTRA(mb_type)){
2540 if(h->deblocking_filter && (simple || !FRAME_MBAFF))
2541 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2543 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2544 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2545 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2548 if(IS_INTRA4x4(mb_type)){
2549 if(simple || !s->encoding){
2550 if(IS_8x8DCT(mb_type)){
2551 for(i=0; i<16; i+=4){
2552 uint8_t * const ptr= dest_y + block_offset[i];
2553 const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2554 const int nnz = h->non_zero_count_cache[ scan8[i] ];
2555 h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2556 (h->topright_samples_available<<i)&0x4000, linesize);
/* nnz==1 with only a DC coeff: cheaper DC-only add */
2558 if(nnz == 1 && h->mb[i*16])
2559 idct_dc_add(ptr, h->mb + i*16, linesize);
2561 idct_add(ptr, h->mb + i*16, linesize);
2565 for(i=0; i<16; i++){
2566 uint8_t * const ptr= dest_y + block_offset[i];
2568 const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
/* fake missing top-right samples by replicating the last top pixel */
2571 if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2572 const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2573 assert(mb_y || linesize <= block_offset[i]);
2574 if(!topright_avail){
2575 tr= ptr[3 - linesize]*0x01010101;
2576 topright= (uint8_t*) &tr;
2578 topright= ptr + 4 - linesize;
2582 h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2583 nnz = h->non_zero_count_cache[ scan8[i] ];
2586 if(nnz == 1 && h->mb[i*16])
2587 idct_dc_add(ptr, h->mb + i*16, linesize);
2589 idct_add(ptr, h->mb + i*16, linesize);
2591 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
/* intra 16x16: predict whole luma block, then dequant/IDCT the DC plane */
2596 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2598 if(!transform_bypass)
2599 h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2601 svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2603 if(h->deblocking_filter && (simple || !FRAME_MBAFF))
2604 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
/* inter macroblock: motion compensation */
2606 hl_motion(h, dest_y, dest_cb, dest_cr,
2607 s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2608 s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2609 s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
/* add luma residuals (intra4x4 residuals were added during prediction) */
2613 if(!IS_INTRA4x4(mb_type)){
2615 if(IS_INTRA16x16(mb_type)){
2616 for(i=0; i<16; i++){
2617 if(h->non_zero_count_cache[ scan8[i] ])
2618 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2619 else if(h->mb[i*16])
2620 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2623 const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2624 for(i=0; i<16; i+=di){
2625 int nnz = h->non_zero_count_cache[ scan8[i] ];
2627 if(nnz==1 && h->mb[i*16])
2628 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2630 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2635 for(i=0; i<16; i++){
2636 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2637 uint8_t * const ptr= dest_y + block_offset[i];
2638 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
/* chroma residuals: dequant DC, then per-4x4 IDCT-add into Cb/Cr */
2644 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2645 uint8_t *dest[2] = {dest_cb, dest_cr};
2646 if(transform_bypass){
2647 idct_add = idct_dc_add = s->dsp.add_pixels4;
2649 idct_add = s->dsp.h264_idct_add;
2650 idct_dc_add = s->dsp.h264_idct_dc_add;
2651 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2652 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2655 for(i=16; i<16+8; i++){
2656 if(h->non_zero_count_cache[ scan8[i] ])
2657 idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2658 else if(h->mb[i*16])
2659 idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2662 for(i=16; i<16+8; i++){
2663 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2664 uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2665 svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
/* deblocking: MBAFF filters the completed pair; otherwise filter this MB */
2671 if(h->deblocking_filter) {
2672 if (!simple && FRAME_MBAFF) {
2673 //FIXME try deblocking one mb at a time?
2674 // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
2675 const int mb_y = s->mb_y - 1;
2676 uint8_t *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
2677 const int mb_xy= mb_x + mb_y*s->mb_stride;
2678 const int mb_type_top = s->current_picture.mb_type[mb_xy];
2679 const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
2680 if (!bottom) return;
2681 pair_dest_y = s->current_picture.data[0] + (mb_y * 16* s->linesize ) + mb_x * 16;
2682 pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2683 pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2685 if(IS_INTRA(mb_type_top | mb_type_bottom))
2686 xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
2688 backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
2692 tprintf(h->s.avctx, "call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
2693 fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
2694 h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2695 h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2696 filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
2699 tprintf(h->s.avctx, "call mbaff filter_mb\n");
2700 fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
2701 h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2702 h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2703 filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2705 tprintf(h->s.avctx, "call filter_mb\n");
2706 backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2707 fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2708 filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2714 * Process a macroblock; this case avoids checks for expensive uncommon cases.
 * (simple=1 lets the compiler drop the MBAFF/PCM/SVQ3/gray paths entirely.)
2716 static void hl_decode_mb_simple(H264Context *h){
2717 hl_decode_mb_internal(h, 1);
2721 * Process a macroblock; this handles edge cases, such as interlacing.
 * (av_noinline keeps this rare path out of the hot simple path's code.)
2723 static void av_noinline hl_decode_mb_complex(H264Context *h){
2724 hl_decode_mb_internal(h, 0);
/**
 * Dispatch macroblock reconstruction to the fast simple path or the
 * complex path, depending on whether any of the expensive features
 * (MBAFF, field MBs, PCM, non-H264 codec, grayscale, encoding) applies.
 */
2727 static void hl_decode_mb(H264Context *h){
2728 MpegEncContext * const s = &h->s;
2729 const int mb_x= s->mb_x;
2730 const int mb_y= s->mb_y;
2731 const int mb_xy= mb_x + mb_y*s->mb_stride;
2732 const int mb_type= s->current_picture.mb_type[mb_xy];
2733 int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 || (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || s->encoding;
2739 hl_decode_mb_complex(h);
2740 else hl_decode_mb_simple(h);
/**
 * Convert a frame Picture (in place) into a single field of the given
 * parity: double the line strides, and for the bottom field advance the
 * data pointers by one line so they address the bottom-field samples.
 */
2743 static void pic_as_field(Picture *pic, const int parity){
2745 for (i = 0; i < 4; ++i) {
2746 if (parity == PICT_BOTTOM_FIELD)
2747 pic->data[i] += pic->linesize[i];
2748 pic->reference = parity;
2749 pic->linesize[i] *= 2;
/**
 * Copy src into dest as a field of the requested parity, if src is a
 * reference for that parity. id_add is added to the resulting pic_id
 * (used to distinguish top/bottom field pic_nums).
 * @return 1 if a field was emitted, 0 if src has no matching parity
 */
2753 static int split_field_copy(Picture *dest, Picture *src,
2754 int parity, int id_add){
2755 int match = !!(src->reference & parity);
2759 pic_as_field(dest, parity);
2761 dest->pic_id += id_add;
2768 * Split one reference list into field parts, interleaving by parity
2769 * as per H.264 spec section 8.2.4.2.5. Output fields have their data pointers
2770 * set to look at the actual start of data for that field.
2772 * @param dest output list
2773 * @param dest_len maximum number of fields to put in dest
2774 * @param src the source reference list containing fields and/or field pairs
2775 * (aka short_ref/long_ref, or
2776 * refFrameListXShortTerm/refFrameListLongTerm in spec-speak)
2777 * @param src_len number of Picture's in source (pairs and unmatched fields)
2778 * @param parity the parity of the picture being decoded/needing
2779 * these ref pics (PICT_{TOP,BOTTOM}_FIELD)
2780 * @return number of fields placed in dest
2782 static int split_field_half_ref_list(Picture *dest, int dest_len,
2783 Picture *src, int src_len, int parity){
/* alternate between same-parity and opposite-parity candidates,
 * preferring same parity first, as the spec's interleave requires */
2784 int same_parity = 1;
2790 for (out_i = 0; out_i < dest_len; out_i += field_output) {
2791 if (same_parity && same_i < src_len) {
2792 field_output = split_field_copy(dest + out_i, src + same_i,
/* a failed copy (no matching field) keeps us on the same parity */
2794 same_parity = !field_output;
2797 } else if (opp_i < src_len) {
2798 field_output = split_field_copy(dest + out_i, src + opp_i,
2799 PICT_FRAME - parity, 0);
2800 same_parity = field_output;
2812 * Split the reference frame list into a reference field list.
2813 * This implements H.264 spec 8.2.4.2.5 for a combined input list.
2814 * The input list contains both reference field pairs and
2815 * unmatched reference fields; it is ordered as spec describes
2816 * RefPicListX for frames in 8.2.4.2.1 and 8.2.4.2.3, except that
2817 * unmatched field pairs are also present. Conceptually this is equivalent
2818 * to concatenation of refFrameListXShortTerm with refFrameListLongTerm.
2820 * @param dest output reference list where ordered fields are to be placed
2821 * @param dest_len max number of fields to place at dest
2822 * @param src source reference list, as described above
2823 * @param src_len number of pictures (pairs and unmatched fields) in src
2824 * @param parity parity of field being currently decoded
2825 * (one of PICT_{TOP,BOTTOM}_FIELD)
2826 * @param long_i index into src array that holds first long reference picture,
2827 * or src_len if no long refs present.
2829 static int split_field_ref_list(Picture *dest, int dest_len,
2830 Picture *src, int src_len,
2831 int parity, int long_i){
/* short-term refs first, then long-term refs appended after them */
2833 int i = split_field_half_ref_list(dest, dest_len, src, long_i, parity);
2837 i += split_field_half_ref_list(dest, dest_len, src + long_i,
2838 src_len - long_i, parity);
2843 * fills the default_ref_list.
 * Builds RefPicList0 (and, for B slices, RefPicList1) in the default
 * order before any reordering commands: POC-sorted around the current
 * picture for B slices, frame_num order for P slices, with long-term
 * refs appended. For field pictures the frame lists are afterwards
 * split into per-field lists via split_field_ref_list().
 * NOTE(review): several lines are elided in this view; comments describe
 * the visible code only.
2845 static int fill_default_ref_list(H264Context *h){
2846 MpegEncContext * const s = &h->s;
2848 int smallest_poc_greater_than_current = -1;
2850 Picture sorted_short_ref[32];
2851 Picture field_entry_list[2][32];
2852 Picture *frame_list[2];
/* field decoding: build frame lists into scratch arrays first, then
 * split into field lists; frame decoding writes default_ref_list directly */
2854 if (FIELD_PICTURE) {
2855 structure_sel = PICT_FRAME;
2856 frame_list[0] = field_entry_list[0];
2857 frame_list[1] = field_entry_list[1];
2860 frame_list[0] = h->default_ref_list[0];
2861 frame_list[1] = h->default_ref_list[1];
2864 if(h->slice_type==FF_B_TYPE){
2871 /* sort frame according to poc in B slice */
2872 for(out_i=0; out_i<h->short_ref_count; out_i++){
2874 int best_poc=INT_MAX;
/* selection sort: next-smallest POC above the running limit */
2876 for(i=0; i<h->short_ref_count; i++){
2877 const int poc= h->short_ref[i]->poc;
2878 if(poc > limit && poc < best_poc){
2884 assert(best_i != INT_MIN);
2887 sorted_short_ref[out_i]= *h->short_ref[best_i];
2888 tprintf(h->s.avctx, "sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
2889 if (-1 == smallest_poc_greater_than_current) {
2890 if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
2891 smallest_poc_greater_than_current = out_i;
2896 tprintf(h->s.avctx, "current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
2898 // find the largest poc
/* L0 walks backwards (past first) from the split point, L1 forwards */
2899 for(list=0; list<2; list++){
2902 int step= list ? -1 : 1;
2904 for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
2906 while(j<0 || j>= h->short_ref_count){
2907 if(j != -99 && step == (list ? -1 : 1))
2910 j= smallest_poc_greater_than_current + (step>>1);
2912 sel = sorted_short_ref[j].reference | structure_sel;
2913 if(sel != PICT_FRAME) continue;
2914 frame_list[list][index ]= sorted_short_ref[j];
2915 frame_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
2917 short_len[list] = index;
/* append long-term refs, pic_id = long-term index */
2919 for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
2921 if(h->long_ref[i] == NULL) continue;
2922 sel = h->long_ref[i]->reference | structure_sel;
2923 if(sel != PICT_FRAME) continue;
2925 frame_list[ list ][index ]= *h->long_ref[i];
2926 frame_list[ list ][index++].pic_id= i;
2931 for(list=0; list<2; list++){
2933 len[list] = split_field_ref_list(h->default_ref_list[list],
2937 s->picture_structure,
2940 // swap the two first elements of L1 when L0 and L1 are identical
2941 if(list && len[0] > 1 && len[0] == len[1])
2942 for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0]; i++)
2944 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2948 if(len[list] < h->ref_count[ list ])
2949 memset(&h->default_ref_list[list][len[list]], 0, sizeof(Picture)*(h->ref_count[ list ] - len[list]));
/* P/SP slices: short-term refs in list order, then long-term refs */
2956 for(i=0; i<h->short_ref_count; i++){
2958 sel = h->short_ref[i]->reference | structure_sel;
2959 if(sel != PICT_FRAME) continue;
2960 frame_list[0][index ]= *h->short_ref[i];
2961 frame_list[0][index++].pic_id= h->short_ref[i]->frame_num;
2964 for(i = 0; i < 16; i++){
2966 if(h->long_ref[i] == NULL) continue;
2967 sel = h->long_ref[i]->reference | structure_sel;
2968 if(sel != PICT_FRAME) continue;
2969 frame_list[0][index ]= *h->long_ref[i];
2970 frame_list[0][index++].pic_id= i;
2974 index = split_field_ref_list(h->default_ref_list[0],
2975 h->ref_count[0], frame_list[0],
2976 index, s->picture_structure,
2979 if(index < h->ref_count[0])
2980 memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
2983 for (i=0; i<h->ref_count[0]; i++) {
2984 tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2986 if(h->slice_type==FF_B_TYPE){
2987 for (i=0; i<h->ref_count[1]; i++) {
2988 tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2995 static void print_short_term(H264Context *h);
2996 static void print_long_term(H264Context *h);
2999 * Extract structure information about the picture described by pic_num in
3000 * the current decoding context (frame or field). Note that pic_num is
3001 * picture number without wrapping (so, 0<=pic_num<max_pic_num).
3002 * @param pic_num picture number for which to extract structure information
3003 * @param structure one of PICT_XXX describing structure of picture
3005 * @return frame number (short term) or long term index of picture
3006 * described by pic_num
3008 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
3009 MpegEncContext * const s = &h->s;
3011 *structure = s->picture_structure;
3014 /* opposite field */
3015 *structure ^= PICT_FRAME;
/**
 * Parses ref_pic_list_reordering() from the slice header and rebuilds
 * h->ref_list[] from h->default_ref_list[] according to the reordering
 * commands in the bitstream. Returns 0 on success, negative on error.
 * NOTE(review): several original lines (break/return paths, loop closers)
 * are elided from this partial listing.
 */
3022 static int decode_ref_pic_list_reordering(H264Context *h){
3023 MpegEncContext * const s = &h->s;
3024 int list, index, pic_structure;
3026 print_short_term(h);
3028 if(h->slice_type==FF_I_TYPE || h->slice_type==FF_SI_TYPE) return 0; //FIXME move before func
3030 for(list=0; list<h->list_count; list++){
/* Start from the default list; reordering commands permute/override entries. */
3031 memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
3033 if(get_bits1(&s->gb)){
3034 int pred= h->curr_pic_num;
3036 for(index=0; ; index++){
3037 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
3038 unsigned int pic_id;
3040 Picture *ref = NULL;
/* idc==3 terminates the command list. */
3042 if(reordering_of_pic_nums_idc==3)
3045 if(index >= h->ref_count[list]){
3046 av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
3050 if(reordering_of_pic_nums_idc<3){
/* idc 0/1: short-term reference, addressed by a signed pic_num delta. */
3051 if(reordering_of_pic_nums_idc<2){
3052 const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
3055 if(abs_diff_pic_num > h->max_pic_num){
3056 av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
3060 if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
3061 else pred+= abs_diff_pic_num;
/* Wrap the predicted pic_num into [0, max_pic_num). */
3062 pred &= h->max_pic_num - 1;
3064 frame_num = pic_num_extract(h, pred, &pic_structure);
/* Search the short-term list newest-to-oldest for a matching frame_num. */
3066 for(i= h->short_ref_count-1; i>=0; i--){
3067 ref = h->short_ref[i];
3068 assert(ref->reference);
3069 assert(!ref->long_ref);
3070 if(ref->data[0] != NULL &&
3071 ref->frame_num == frame_num &&
3072 (ref->reference & pic_structure) &&
3073 ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer
/* idc 2: long-term reference, addressed directly by index. */
3080 pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
3082 long_idx= pic_num_extract(h, pic_id, &pic_structure);
3085 av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
3088 ref = h->long_ref[long_idx];
3089 assert(!(ref && !ref->reference));
3090 if(ref && (ref->reference & pic_structure)){
3091 ref->pic_id= pic_id;
3092 assert(ref->long_ref);
3100 av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
3101 memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
/* Insert the found picture at 'index', shifting later entries down. */
3103 for(i=index; i+1<h->ref_count[list]; i++){
3104 if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
3107 for(; i > index; i--){
3108 h->ref_list[list][i]= h->ref_list[list][i-1];
3110 h->ref_list[list][index]= *ref;
3112 pic_as_field(&h->ref_list[list][index], pic_structure);
3116 av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
/* Replace any empty list slots with the current picture (error concealment). */
3122 for(list=0; list<h->list_count; list++){
3123 for(index= 0; index < h->ref_count[list]; index++){
3124 if(!h->ref_list[list][index].data[0])
3125 h->ref_list[list][index]= s->current_picture;
3129 if(h->slice_type==FF_B_TYPE && !h->direct_spatial_mv_pred)
3130 direct_dist_scale_factor(h);
3131 direct_ref_list_init(h);
/**
 * For MBAFF decoding: derives per-field reference entries (and matching
 * weighted-prediction tables) from the frame entries of h->ref_list[].
 * Field i of list entries is stored at offset 16+2*i (top) and 16+2*i+1
 * (bottom).
 */
3135 static void fill_mbaff_ref_list(H264Context *h){
3137 for(list=0; list<2; list++){ //FIXME try list_count
3138 for(i=0; i<h->ref_count[list]; i++){
3139 Picture *frame = &h->ref_list[list][i];
3140 Picture *field = &h->ref_list[list][16+2*i];
/* Field pictures have twice the line stride of the interleaved frame. */
3143 field[0].linesize[j] <<= 1;
3144 field[0].reference = PICT_TOP_FIELD;
3145 field[1] = field[0];
/* Bottom field starts one (frame) line below the top field. */
3147 field[1].data[j] += frame->linesize[j];
3148 field[1].reference = PICT_BOTTOM_FIELD;
/* Duplicate the frame's explicit weights/offsets for both derived fields. */
3150 h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
3151 h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
3153 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
3154 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
/* Mirror implicit weights along both dimensions of the table. */
3158 for(j=0; j<h->ref_count[1]; j++){
3159 for(i=0; i<h->ref_count[0]; i++)
3160 h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
3161 memcpy(h->implicit_weight[16+2*j], h->implicit_weight[j], sizeof(*h->implicit_weight));
3162 memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
/**
 * Parses pred_weight_table() from the slice header: explicit weighted
 * prediction weights/offsets for luma and chroma, per list and reference.
 * Absent weights fall back to the defaults 1<<log2_denom and offset 0.
 */
3166 static int pred_weight_table(H264Context *h){
3167 MpegEncContext * const s = &h->s;
3169 int luma_def, chroma_def;
3172 h->use_weight_chroma= 0;
3173 h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3174 h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
/* Default weight = 1.0 in fixed point, i.e. 1 << denom. */
3175 luma_def = 1<<h->luma_log2_weight_denom;
3176 chroma_def = 1<<h->chroma_log2_weight_denom;
3178 for(list=0; list<2; list++){
3179 for(i=0; i<h->ref_count[list]; i++){
3180 int luma_weight_flag, chroma_weight_flag;
3182 luma_weight_flag= get_bits1(&s->gb);
3183 if(luma_weight_flag){
3184 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3185 h->luma_offset[list][i]= get_se_golomb(&s->gb);
/* Only mark weighting as in use if the values differ from the defaults. */
3186 if( h->luma_weight[list][i] != luma_def
3187 || h->luma_offset[list][i] != 0)
3190 h->luma_weight[list][i]= luma_def;
3191 h->luma_offset[list][i]= 0;
3194 chroma_weight_flag= get_bits1(&s->gb);
3195 if(chroma_weight_flag){
3198 h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3199 h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3200 if( h->chroma_weight[list][i][j] != chroma_def
3201 || h->chroma_offset[list][i][j] != 0)
3202 h->use_weight_chroma= 1;
3207 h->chroma_weight[list][i][j]= chroma_def;
3208 h->chroma_offset[list][i][j]= 0;
/* List 1 weights are only present for B slices. */
3212 if(h->slice_type != FF_B_TYPE) break;
3214 h->use_weight= h->use_weight || h->use_weight_chroma;
/**
 * Fills h->implicit_weight[][] for implicit weighted bi-prediction
 * (weighted_bipred_idc == 2): weights derived from POC distances of the
 * two references relative to the current picture.
 */
3218 static void implicit_weight_table(H264Context *h){
3219 MpegEncContext * const s = &h->s;
3221 int cur_poc = s->current_picture_ptr->poc;
/* Single symmetric reference pair around the current POC: no weighting. */
3223 if( h->ref_count[0] == 1 && h->ref_count[1] == 1
3224 && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3226 h->use_weight_chroma= 0;
3231 h->use_weight_chroma= 2;
/* Implicit mode always uses denominator 2^5 = 32. */
3232 h->luma_log2_weight_denom= 5;
3233 h->chroma_log2_weight_denom= 5;
3235 for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3236 int poc0 = h->ref_list[0][ref0].poc;
3237 for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3238 int poc1 = h->ref_list[1][ref1].poc;
/* td/tb clipping and the tx formula follow the spec's distance scaling. */
3239 int td = av_clip(poc1 - poc0, -128, 127);
3241 int tb = av_clip(cur_poc - poc0, -128, 127);
3242 int tx = (16384 + (FFABS(td) >> 1)) / td;
3243 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
/* Out-of-range scale factors fall back to the equal weight 32. */
3244 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3245 h->implicit_weight[ref0][ref1] = 32;
3247 h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3249 h->implicit_weight[ref0][ref1] = 32;
3255 * Mark a picture as no longer needed for reference. The refmask
3256 * argument allows unreferencing of individual fields or the whole frame.
3257 * If the picture becomes entirely unreferenced, but is being held for
3258 * display purposes, it is marked as such.
3259 * @param refmask mask of fields to unreference; the mask is bitwise
3260 * anded with the reference marking of pic
3261 * @return non-zero if pic becomes entirely unreferenced (except possibly
3262 * for display purposes) zero if one of the fields remains in
3265 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
/* If any field bit survives the mask, the picture stays referenced. */
3267 if (pic->reference &= refmask) {
/* Otherwise keep a DELAYED_PIC_REF marking if the pic is queued for output. */
3270 if(pic == h->delayed_output_pic)
3271 pic->reference=DELAYED_PIC_REF;
3273 for(i = 0; h->delayed_pic[i]; i++)
3274 if(pic == h->delayed_pic[i]){
3275 pic->reference=DELAYED_PIC_REF;
3284 * instantaneous decoder refresh.
/* Drops every short-term and long-term reference, as required when an
 * IDR slice is decoded. */
3286 static void idr(H264Context *h){
3289 for(i=0; i<16; i++){
3290 if (h->long_ref[i] != NULL) {
/* refmask 0 fully unreferences the picture (display hold handled inside). */
3291 unreference_pic(h, h->long_ref[i], 0);
3292 h->long_ref[i]= NULL;
3295 h->long_ref_count=0;
3297 for(i=0; i<h->short_ref_count; i++){
3298 unreference_pic(h, h->short_ref[i], 0);
3299 h->short_ref[i]= NULL;
3301 h->short_ref_count=0;
3304 /* forget old pics after a seek */
3305 static void flush_dpb(AVCodecContext *avctx){
3306 H264Context *h= avctx->priv_data;
/* Drop all delayed-output pictures and clear their reference marks. */
3308 for(i=0; i<16; i++) {
3309 if(h->delayed_pic[i])
3310 h->delayed_pic[i]->reference= 0;
3311 h->delayed_pic[i]= NULL;
3313 if(h->delayed_output_pic)
3314 h->delayed_output_pic->reference= 0;
3315 h->delayed_output_pic= NULL;
3317 if(h->s.current_picture_ptr)
3318 h->s.current_picture_ptr->reference= 0;
/* Forget any pending first field so the next picture starts a fresh pair. */
3319 h->s.first_field= 0;
3320 ff_mpeg_flush(avctx);
3324 * Find a Picture in the short term reference list by frame number.
3325 * @param frame_num frame number to search for
3326 * @param idx the index into h->short_ref where returned picture is found
3327 * undefined if no picture found.
3328 * @return pointer to the found picture, or NULL if no pic with the provided
3329 * frame number is found
3331 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3332 MpegEncContext * const s = &h->s;
/* Linear scan; the short-term list is small (<= 16 entries). */
3335 for(i=0; i<h->short_ref_count; i++){
3336 Picture *pic= h->short_ref[i];
3337 if(s->avctx->debug&FF_DEBUG_MMCO)
3338 av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3339 if(pic->frame_num == frame_num) {
3348 * Remove a picture from the short term reference list by its index in
3349 * that list. This does no checking on the provided index; it is assumed
3350 * to be valid. Other list entries are shifted down.
3351 * @param i index into h->short_ref of picture to remove.
3353 static void remove_short_at_index(H264Context *h, int i){
3354 assert(i > 0 && i < h->short_ref_count);
3355 h->short_ref[i]= NULL;
3356 if (--h->short_ref_count)
3357 memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3362 * @return the removed picture or NULL if an error occurs
/* Remove a picture from the short-term list by frame number (find + remove). */
3364 static Picture * remove_short(H264Context *h, int frame_num){
3365 MpegEncContext * const s = &h->s;
3369 if(s->avctx->debug&FF_DEBUG_MMCO)
3370 av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3372 pic = find_short(h, frame_num, &i);
/* Only remove when found; otherwise the NULL pic is returned unchanged. */
3374 remove_short_at_index(h, i);
3380 * Remove a picture from the long term reference list by its index in
3381 * that list. This does no checking on the provided index; it is assumed
3382 * to be valid. The removed entry is set to NULL. Other entries are unaffected.
3383 * @param i index into h->long_ref of picture to remove.
3385 static void remove_long_at_index(H264Context *h, int i){
3386 h->long_ref[i]= NULL;
3387 h->long_ref_count--;
3392 * @return the removed picture or NULL if an error occurs
/* Remove the long-term reference at index i and return it (may be NULL). */
3394 static Picture * remove_long(H264Context *h, int i){
3397 pic= h->long_ref[i];
3399 remove_long_at_index(h, i);
3405 * print short term list
/* Debug helper: dumps the short-term list when FF_DEBUG_MMCO is enabled. */
3407 static void print_short_term(H264Context *h) {
3409 if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3410 av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3411 for(i=0; i<h->short_ref_count; i++){
3412 Picture *pic= h->short_ref[i];
3413 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3419 * print long term list
/* Debug helper: dumps all 16 long-term slots when FF_DEBUG_MMCO is enabled. */
3421 static void print_long_term(H264Context *h) {
3423 if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3424 av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3425 for(i = 0; i < 16; i++){
3426 Picture *pic= h->long_ref[i];
3428 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3435 * Executes the reference picture marking (memory management control operations).
/* Applies the parsed MMCO commands to the short/long reference lists and
 * finally inserts the current picture as a reference (sliding window when
 * no explicit long-term assignment was made).
 * NOTE(review): many original lines are elided from this listing, including
 * several break statements and closing braces. */
3437 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3438 MpegEncContext * const s = &h->s;
3440 int current_ref_assigned=0;
3443 if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3444 av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3446 for(i=0; i<mmco_count; i++){
3447 int structure, frame_num, unref_pic;
3448 if(s->avctx->debug&FF_DEBUG_MMCO)
3449 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3451 switch(mmco[i].opcode){
/* MMCO 1: mark a short-term picture (or one of its fields) unused. */
3452 case MMCO_SHORT2UNUSED:
3453 if(s->avctx->debug&FF_DEBUG_MMCO)
3454 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3455 frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3456 pic = find_short(h, frame_num, &j);
3458 if (unreference_pic(h, pic, structure ^ PICT_FRAME))
3459 remove_short_at_index(h, j);
3460 } else if(s->avctx->debug&FF_DEBUG_MMCO)
3461 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short failure\n");
/* MMCO 3: move a short-term picture to a long-term index. */
3463 case MMCO_SHORT2LONG:
3464 if (FIELD_PICTURE && mmco[i].long_arg < h->long_ref_count &&
3465 h->long_ref[mmco[i].long_arg]->frame_num ==
3466 mmco[i].short_pic_num / 2) {
3467 /* do nothing, we've already moved this field pair. */
3469 int frame_num = mmco[i].short_pic_num >> FIELD_PICTURE;
/* Evict any previous occupant of the target long-term slot first. */
3471 pic= remove_long(h, mmco[i].long_arg);
3472 if(pic) unreference_pic(h, pic, 0);
3474 h->long_ref[ mmco[i].long_arg ]= remove_short(h, frame_num);
3475 if (h->long_ref[ mmco[i].long_arg ]){
3476 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3477 h->long_ref_count++;
/* MMCO 2: mark a long-term picture (or one of its fields) unused. */
3481 case MMCO_LONG2UNUSED:
3482 j = pic_num_extract(h, mmco[i].long_arg, &structure);
3483 pic = h->long_ref[j];
3485 if (unreference_pic(h, pic, structure ^ PICT_FRAME))
3486 remove_long_at_index(h, j);
3487 } else if(s->avctx->debug&FF_DEBUG_MMCO)
3488 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
/* MMCO 6: assign the current picture a long-term index. */
3492 if (FIELD_PICTURE && !s->first_field) {
3493 if (h->long_ref[mmco[i].long_arg] == s->current_picture_ptr) {
3494 /* Just mark second field as referenced */
3496 } else if (s->current_picture_ptr->reference) {
3497 /* First field in pair is in short term list or
3498 * at a different long term index.
3499 * This is not allowed; see 7.4.3, notes 2 and 3.
3500 * Report the problem and keep the pair where it is,
3501 * and mark this field valid.
3503 av_log(h->s.avctx, AV_LOG_ERROR,
3504 "illegal long term reference assignment for second "
3505 "field in complementary field pair (first field is "
3506 "short term or has non-matching long index)\n");
3512 pic= remove_long(h, mmco[i].long_arg);
3513 if(pic) unreference_pic(h, pic, 0);
3515 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3516 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3517 h->long_ref_count++;
3520 s->current_picture_ptr->reference |= s->picture_structure;
3521 current_ref_assigned=1;
/* MMCO 4: set max long-term index; drop entries above the new max. */
3523 case MMCO_SET_MAX_LONG:
3524 assert(mmco[i].long_arg <= 16);
3525 // just remove the long term which index is greater than new max
3526 for(j = mmco[i].long_arg; j<16; j++){
3527 pic = remove_long(h, j);
3528 if (pic) unreference_pic(h, pic, 0);
/* MMCO 5 (reset): empty both reference lists. */
3532 while(h->short_ref_count){
3533 pic= remove_short(h, h->short_ref[0]->frame_num);
3534 if(pic) unreference_pic(h, pic, 0);
3536 for(j = 0; j < 16; j++) {
3537 pic= remove_long(h, j);
3538 if(pic) unreference_pic(h, pic, 0);
/* Second field of a pair whose first field is already a reference. */
3545 if (!current_ref_assigned && FIELD_PICTURE &&
3546 !s->first_field && s->current_picture_ptr->reference) {
3548 /* Second field of complementary field pair; the first field of
3549 * which is already referenced. If short referenced, it
3550 * should be first entry in short_ref. If not, it must exist
3551 * in long_ref; trying to put it on the short list here is an
3552 * error in the encoded bit stream (ref: 7.4.3, NOTE 2 and 3).
3554 if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3555 /* Just mark the second field valid */
3556 s->current_picture_ptr->reference = PICT_FRAME;
3557 } else if (s->current_picture_ptr->long_ref) {
3558 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3559 "assignment for second field "
3560 "in complementary field pair "
3561 "(first field is long term)\n");
3564 * First field in reference, but not in any sensible place on our
3565 * reference lists. This shouldn't happen unless reference
3566 * handling somewhere else is wrong.
3570 current_ref_assigned = 1;
/* Default sliding-window behavior: push the current picture onto the
 * front of the short-term list. */
3573 if(!current_ref_assigned){
3574 pic= remove_short(h, s->current_picture_ptr->frame_num);
3576 unreference_pic(h, pic, 0);
3577 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3580 if(h->short_ref_count)
3581 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3583 h->short_ref[0]= s->current_picture_ptr;
3584 h->short_ref[0]->long_ref=0;
3585 h->short_ref_count++;
3586 s->current_picture_ptr->reference |= s->picture_structure;
/* Guard against buffer overrun on corrupt streams: discard one reference. */
3589 if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3591 /* We have too many reference frames, probably due to corrupted
3592 * stream. Need to discard one frame. Prevents overrun of the
3593 * short_ref and long_ref buffers.
3595 av_log(h->s.avctx, AV_LOG_ERROR,
3596 "number of reference frames exceeds max (probably "
3597 "corrupt input), discarding one\n");
3599 if (h->long_ref_count) {
3600 for (i = 0; i < 16; ++i)
3605 pic = h->long_ref[i];
3606 remove_long_at_index(h, i);
3608 pic = h->short_ref[h->short_ref_count - 1];
3609 remove_short_at_index(h, h->short_ref_count - 1);
3611 unreference_pic(h, pic, 0);
3614 print_short_term(h);
/**
 * Parses dec_ref_pic_marking() from the slice header into h->mmco[].
 * For IDR slices this synthesizes the equivalent MMCO commands; otherwise
 * it reads explicit adaptive marking commands, or sets up sliding-window
 * removal of the oldest short-term reference when the buffer is full.
 */
3619 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3620 MpegEncContext * const s = &h->s;
3623 if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
/* no_output_of_prior_pics_flag: 1 -> broken_link 0, 0 -> -1. */
3624 s->broken_link= get_bits1(gb) -1;
3625 h->mmco[0].long_arg= get_bits1(gb) - 1; // current_long_term_idx
3626 if(h->mmco[0].long_arg == -1)
3629 h->mmco[0].opcode= MMCO_LONG;
3633 if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3634 for(i= 0; i<MAX_MMCO_COUNT; i++) {
3635 MMCOOpcode opcode= get_ue_golomb(gb);
3637 h->mmco[i].opcode= opcode;
3638 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
/* difference_of_pic_nums_minus1 -> absolute pic num, wrapped. */
3639 h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3640 /* if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3641 av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3645 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3646 unsigned int long_arg= get_ue_golomb(gb);
/* Field coding allows indices up to 31 only for LONG2UNUSED. */
3647 if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3648 av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3651 h->mmco[i].long_arg= long_arg;
3654 if(opcode > (unsigned)MMCO_LONG){
3655 av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3658 if(opcode == MMCO_END)
/* Sliding window: no adaptive commands present. */
3663 assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3665 if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3666 !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3667 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3668 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
/* When decoding fields, both fields of the oldest frame are released. */
3670 if (FIELD_PICTURE) {
3671 h->mmco[0].short_pic_num *= 2;
3672 h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3673 h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
/**
 * Computes the picture order count (POC) of the current picture for all
 * three POC types (0: lsb/msb from slice header, 1: expected POC from SPS
 * cycle, 2: derived from frame_num), and stores the per-field and frame
 * POC values into the current picture.
 */
3684 static int init_poc(H264Context *h){
3685 MpegEncContext * const s = &h->s;
3686 const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3689 if(h->nal_unit_type == NAL_IDR_SLICE){
3690 h->frame_num_offset= 0;
/* frame_num wrapped since the previous picture: bump the offset. */
3692 if(h->frame_num < h->prev_frame_num)
3693 h->frame_num_offset= h->prev_frame_num_offset + max_frame_num;
3695 h->frame_num_offset= h->prev_frame_num_offset;
3698 if(h->sps.poc_type==0){
3699 const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3701 if(h->nal_unit_type == NAL_IDR_SLICE){
/* Standard msb derivation: detect lsb wrap relative to the previous POC. */
3706 if (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3707 h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3708 else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3709 h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3711 h->poc_msb = h->prev_poc_msb;
3712 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3714 field_poc[1] = h->poc_msb + h->poc_lsb;
3715 if(s->picture_structure == PICT_FRAME)
3716 field_poc[1] += h->delta_poc_bottom;
3717 }else if(h->sps.poc_type==1){
3718 int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3721 if(h->sps.poc_cycle_length != 0)
3722 abs_frame_num = h->frame_num_offset + h->frame_num;
3726 if(h->nal_ref_idc==0 && abs_frame_num > 0)
3729 expected_delta_per_poc_cycle = 0;
3730 for(i=0; i < h->sps.poc_cycle_length; i++)
3731 expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3733 if(abs_frame_num > 0){
3734 int poc_cycle_cnt = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3735 int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3737 expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3738 for(i = 0; i <= frame_num_in_poc_cycle; i++)
3739 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3743 if(h->nal_ref_idc == 0)
3744 expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3746 field_poc[0] = expectedpoc + h->delta_poc[0];
3747 field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3749 if(s->picture_structure == PICT_FRAME)
3750 field_poc[1] += h->delta_poc[1];
/* POC type 2: POC follows decoding order, derived from frame_num. */
3753 if(h->nal_unit_type == NAL_IDR_SLICE){
3756 if(h->nal_ref_idc) poc= 2*(h->frame_num_offset + h->frame_num);
3757 else poc= 2*(h->frame_num_offset + h->frame_num) - 1;
/* Store per-field POCs; a frame gets both, a field only its own. */
3763 if(s->picture_structure != PICT_BOTTOM_FIELD) {
3764 s->current_picture_ptr->field_poc[0]= field_poc[0];
3765 s->current_picture_ptr->poc = field_poc[0];
3767 if(s->picture_structure != PICT_TOP_FIELD) {
3768 s->current_picture_ptr->field_poc[1]= field_poc[1];
3769 s->current_picture_ptr->poc = field_poc[1];
/* Complete frame (or second field): frame POC = min of the field POCs. */
3771 if(!FIELD_PICTURE || !s->first_field) {
3772 Picture *cur = s->current_picture_ptr;
3773 cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3781 * initialize scan tables
/* Sets up the zigzag/field scan orders. When the DSP uses the C IDCT the
 * canonical tables are copied verbatim; otherwise the T() macros permute
 * them to match the (transposed) coefficient layout of the optimized IDCT. */
3783 static void init_scan_tables(H264Context *h){
3784 MpegEncContext * const s = &h->s;
3786 if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3787 memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3788 memcpy(h-> field_scan, field_scan, 16*sizeof(uint8_t));
3790 for(i=0; i<16; i++){
/* Swap row/column within the 4x4 index (transpose). */
3791 #define T(x) (x>>2) | ((x<<2) & 0xF)
3792 h->zigzag_scan[i] = T(zigzag_scan[i]);
3793 h-> field_scan[i] = T( field_scan[i]);
3797 if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3798 memcpy(h->zigzag_scan8x8, zigzag_scan8x8, 64*sizeof(uint8_t));
3799 memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3800 memcpy(h->field_scan8x8, field_scan8x8, 64*sizeof(uint8_t));
3801 memcpy(h->field_scan8x8_cavlc, field_scan8x8_cavlc, 64*sizeof(uint8_t));
3803 for(i=0; i<64; i++){
/* Swap row/column within the 8x8 index (transpose). */
3804 #define T(x) (x>>3) | ((x&7)<<3)
3805 h->zigzag_scan8x8[i] = T(zigzag_scan8x8[i]);
3806 h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3807 h->field_scan8x8[i] = T(field_scan8x8[i]);
3808 h->field_scan8x8_cavlc[i] = T(field_scan8x8_cavlc[i]);
/* Lossless (transform bypass, qp==0) always uses the untransposed tables. */
3812 if(h->sps.transform_bypass){ //FIXME same ugly
3813 h->zigzag_scan_q0 = zigzag_scan;
3814 h->zigzag_scan8x8_q0 = zigzag_scan8x8;
3815 h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3816 h->field_scan_q0 = field_scan;
3817 h->field_scan8x8_q0 = field_scan8x8;
3818 h->field_scan8x8_cavlc_q0 = field_scan8x8_cavlc;
3820 h->zigzag_scan_q0 = h->zigzag_scan;
3821 h->zigzag_scan8x8_q0 = h->zigzag_scan8x8;
3822 h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3823 h->field_scan_q0 = h->field_scan;
3824 h->field_scan8x8_q0 = h->field_scan8x8;
3825 h->field_scan8x8_cavlc_q0 = h->field_scan8x8_cavlc;
3830 * Replicates H264 "master" context to thread contexts.
/* Shallow-copies per-picture decoding state from the master context so
 * slice threads see consistent reference lists, POC state and dequant
 * tables. Pointer members are shared, not deep-copied. */
3832 static void clone_slice(H264Context *dst, H264Context *src)
3834 memcpy(dst->block_offset, src->block_offset, sizeof(dst->block_offset));
3835 dst->s.current_picture_ptr = src->s.current_picture_ptr;
3836 dst->s.current_picture = src->s.current_picture;
3837 dst->s.linesize = src->s.linesize;
3838 dst->s.uvlinesize = src->s.uvlinesize;
3839 dst->s.first_field = src->s.first_field;
3841 dst->prev_poc_msb = src->prev_poc_msb;
3842 dst->prev_poc_lsb = src->prev_poc_lsb;
3843 dst->prev_frame_num_offset = src->prev_frame_num_offset;
3844 dst->prev_frame_num = src->prev_frame_num;
3845 dst->short_ref_count = src->short_ref_count;
3847 memcpy(dst->short_ref, src->short_ref, sizeof(dst->short_ref));
3848 memcpy(dst->long_ref, src->long_ref, sizeof(dst->long_ref));
3849 memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3850 memcpy(dst->ref_list, src->ref_list, sizeof(dst->ref_list));
3852 memcpy(dst->dequant4_coeff, src->dequant4_coeff, sizeof(src->dequant4_coeff));
3853 memcpy(dst->dequant8_coeff, src->dequant8_coeff, sizeof(src->dequant8_coeff));
3857 * decodes a slice header.
3858 * this will allso call MPV_common_init() and frame_start() as needed
3860 * @param h h264context
3861 * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3863 * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3865 static int decode_slice_header(H264Context *h, H264Context *h0){
3866 MpegEncContext * const s = &h->s;
3867 MpegEncContext * const s0 = &h0->s;
3868 unsigned int first_mb_in_slice;
3869 unsigned int pps_id;
3870 int num_ref_idx_active_override_flag;
3871 static const uint8_t slice_type_map[5]= {FF_P_TYPE, FF_B_TYPE, FF_I_TYPE, FF_SP_TYPE, FF_SI_TYPE};
3872 unsigned int slice_type, tmp, i;
3873 int default_ref_list_done = 0;
3874 int last_pic_structure;
3876 s->dropable= h->nal_ref_idc == 0;
3878 if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3879 s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3880 s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3882 s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3883 s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3886 first_mb_in_slice= get_ue_golomb(&s->gb);
3888 if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3889 h0->current_slice = 0;
3890 if (!s0->first_field)
3891 s->current_picture_ptr= NULL;
3894 slice_type= get_ue_golomb(&s->gb);
3896 av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3901 h->slice_type_fixed=1;
3903 h->slice_type_fixed=0;
3905 slice_type= slice_type_map[ slice_type ];
3906 if (slice_type == FF_I_TYPE
3907 || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3908 default_ref_list_done = 1;
3910 h->slice_type= slice_type;
3912 s->pict_type= h->slice_type; // to make a few old func happy, it's wrong though
3913 if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3914 av_log(h->s.avctx, AV_LOG_ERROR,
3915 "B picture before any references, skipping\n");
3919 pps_id= get_ue_golomb(&s->gb);
3920 if(pps_id>=MAX_PPS_COUNT){
3921 av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3924 if(!h0->pps_buffers[pps_id]) {
3925 av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n");
3928 h->pps= *h0->pps_buffers[pps_id];
3930 if(!h0->sps_buffers[h->pps.sps_id]) {
3931 av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n");
3934 h->sps = *h0->sps_buffers[h->pps.sps_id];
3936 if(h == h0 && h->dequant_coeff_pps != pps_id){
3937 h->dequant_coeff_pps = pps_id;
3938 init_dequant_tables(h);
3941 s->mb_width= h->sps.mb_width;
3942 s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3944 h->b_stride= s->mb_width*4;
3945 h->b8_stride= s->mb_width*2;
3947 s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3948 if(h->sps.frame_mbs_only_flag)
3949 s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3951 s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3953 if (s->context_initialized
3954 && ( s->width != s->avctx->width || s->height != s->avctx->height)) {
3956 return -1; // width / height changed during parallelized decoding
3960 if (!s->context_initialized) {
3962 return -1; // we cant (re-)initialize context during parallel decoding
3963 if (MPV_common_init(s) < 0)
3967 init_scan_tables(h);
3970 for(i = 1; i < s->avctx->thread_count; i++) {
3972 c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3973 memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3974 memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3977 init_scan_tables(c);
3981 for(i = 0; i < s->avctx->thread_count; i++)
3982 if(context_init(h->thread_context[i]) < 0)
3985 s->avctx->width = s->width;
3986 s->avctx->height = s->height;
3987 s->avctx->sample_aspect_ratio= h->sps.sar;
3988 if(!s->avctx->sample_aspect_ratio.den)
3989 s->avctx->sample_aspect_ratio.den = 1;
3991 if(h->sps.timing_info_present_flag){
3992 s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3993 if(h->x264_build > 0 && h->x264_build < 44)
3994 s->avctx->time_base.den *= 2;
3995 av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3996 s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
4000 h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
4003 h->mb_aff_frame = 0;
4004 last_pic_structure = s0->picture_structure;
4005 if(h->sps.frame_mbs_only_flag){
4006 s->picture_structure= PICT_FRAME;
4008 if(get_bits1(&s->gb)) { //field_pic_flag
4009 s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
4011 s->picture_structure= PICT_FRAME;
4012 h->mb_aff_frame = h->sps.mb_aff;
4016 if(h0->current_slice == 0){
4017 /* See if we have a decoded first field looking for a pair... */
4018 if (s0->first_field) {
4019 assert(s0->current_picture_ptr);
4020 assert(s0->current_picture_ptr->data[0]);
4021 assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
4023 /* figure out if we have a complementary field pair */
4024 if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
4026 * Previous field is unmatched. Don't display it, but let it
4027 * remain for reference if marked as such.
4029 s0->current_picture_ptr = NULL;
4030 s0->first_field = FIELD_PICTURE;
4033 if (h->nal_ref_idc &&
4034 s0->current_picture_ptr->reference &&
4035 s0->current_picture_ptr->frame_num != h->frame_num) {
4037 * This and previous field were reference, but had
4038 * different frame_nums. Consider this field first in
4039 * pair. Throw away previous field except for reference
4042 s0->first_field = 1;
4043 s0->current_picture_ptr = NULL;
4046 /* Second field in complementary pair */
4047 s0->first_field = 0;
4052 /* Frame or first field in a potentially complementary pair */
4053 assert(!s0->current_picture_ptr);
4054 s0->first_field = FIELD_PICTURE;
4057 if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
4058 s0->first_field = 0;
4065 s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
4067 assert(s->mb_num == s->mb_width * s->mb_height);
4068 if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
4069 first_mb_in_slice >= s->mb_num){
4070 av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
4073 s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
4074 s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
4075 if (s->picture_structure == PICT_BOTTOM_FIELD)
4076 s->resync_mb_y = s->mb_y = s->mb_y + 1;
4077 assert(s->mb_y < s->mb_height);
4079 if(s->picture_structure==PICT_FRAME){
4080 h->curr_pic_num= h->frame_num;
4081 h->max_pic_num= 1<< h->sps.log2_max_frame_num;
4083 h->curr_pic_num= 2*h->frame_num + 1;
4084 h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
4087 if(h->nal_unit_type == NAL_IDR_SLICE){
4088 get_ue_golomb(&s->gb); /* idr_pic_id */
4091 if(h->sps.poc_type==0){
4092 h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
4094 if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
4095 h->delta_poc_bottom= get_se_golomb(&s->gb);
4099 if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
4100 h->delta_poc[0]= get_se_golomb(&s->gb);
4102 if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
4103 h->delta_poc[1]= get_se_golomb(&s->gb);
4108 if(h->pps.redundant_pic_cnt_present){
4109 h->redundant_pic_count= get_ue_golomb(&s->gb);
4112 //set defaults, might be overriden a few line later
4113 h->ref_count[0]= h->pps.ref_count[0];
4114 h->ref_count[1]= h->pps.ref_count[1];
4116 if(h->slice_type == FF_P_TYPE || h->slice_type == FF_SP_TYPE || h->slice_type == FF_B_TYPE){
4117 if(h->slice_type == FF_B_TYPE){
4118 h->direct_spatial_mv_pred= get_bits1(&s->gb);
4120 num_ref_idx_active_override_flag= get_bits1(&s->gb);
4122 if(num_ref_idx_active_override_flag){
4123 h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
4124 if(h->slice_type==FF_B_TYPE)
4125 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
4127 if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
4128 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
4129 h->ref_count[0]= h->ref_count[1]= 1;
4133 if(h->slice_type == FF_B_TYPE)
4140 if(!default_ref_list_done){
4141 fill_default_ref_list(h);
4144 if(decode_ref_pic_list_reordering(h) < 0)
4147 if( (h->pps.weighted_pred && (h->slice_type == FF_P_TYPE || h->slice_type == FF_SP_TYPE ))
4148 || (h->pps.weighted_bipred_idc==1 && h->slice_type==FF_B_TYPE ) )
4149 pred_weight_table(h);
4150 else if(h->pps.weighted_bipred_idc==2 && h->slice_type==FF_B_TYPE)
4151 implicit_weight_table(h);
4156 decode_ref_pic_marking(h0, &s->gb);
4159 fill_mbaff_ref_list(h);
4161 if( h->slice_type != FF_I_TYPE && h->slice_type != FF_SI_TYPE && h->pps.cabac ){
4162 tmp = get_ue_golomb(&s->gb);
4164 av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
4167 h->cabac_init_idc= tmp;
4170 h->last_qscale_diff = 0;
4171 tmp = h->pps.init_qp + get_se_golomb(&s->gb);
4173 av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
4177 h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
4178 h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
4179 //FIXME qscale / qp ... stuff
4180 if(h->slice_type == FF_SP_TYPE){
4181 get_bits1(&s->gb); /* sp_for_switch_flag */
4183 if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
4184 get_se_golomb(&s->gb); /* slice_qs_delta */
4187 h->deblocking_filter = 1;
4188 h->slice_alpha_c0_offset = 0;
4189 h->slice_beta_offset = 0;
4190 if( h->pps.deblocking_filter_parameters_present ) {
4191 tmp= get_ue_golomb(&s->gb);
4193 av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4196 h->deblocking_filter= tmp;
4197 if(h->deblocking_filter < 2)
4198 h->deblocking_filter^= 1; // 1<->0
4200 if( h->deblocking_filter ) {
4201 h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4202 h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4206 if( s->avctx->skip_loop_filter >= AVDISCARD_ALL
4207 ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type != FF_I_TYPE)
4208 ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR && h->slice_type == FF_B_TYPE)
4209 ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4210 h->deblocking_filter= 0;
4212 if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4213 if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4214 /* Cheat slightly for speed:
4215 Do not bother to deblock across slices. */
4216 h->deblocking_filter = 2;
4218 h0->max_contexts = 1;
4219 if(!h0->single_decode_warning) {
4220 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4221 h0->single_decode_warning = 1;
4224 return 1; // deblocking switched inside frame
4229 if( h->pps.num_slice_groups > 1 && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4230 slice_group_change_cycle= get_bits(&s->gb, ?);
4233 h0->last_slice_type = slice_type;
4234 h->slice_num = ++h0->current_slice;
4236 h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4237 h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4239 if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4240 av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s\n",
4242 (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4244 av_get_pict_type_char(h->slice_type),
4245 pps_id, h->frame_num,
4246 s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4247 h->ref_count[0], h->ref_count[1],
4249 h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4251 h->use_weight==1 && h->use_weight_chroma ? "c" : ""
4261 static inline int get_level_prefix(GetBitContext *gb){
4265 OPEN_READER(re, gb);
4266 UPDATE_CACHE(re, gb);
4267 buf=GET_CACHE(re, gb);
4269 log= 32 - av_log2(buf);
4271 print_bin(buf>>(32-log), log);
4272 av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4275 LAST_SKIP_BITS(re, gb, log);
4276 CLOSE_READER(re, gb);
4281 static inline int get_dct8x8_allowed(H264Context *h){
4284 if(!IS_SUB_8X8(h->sub_mb_type[i])
4285 || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4292 * decodes a residual block.
4293 * @param n block index
4294 * @param scantable scantable
4295 * @param max_coeff number of coefficients in the block
4296 * @return <0 if an error occurred
4298 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4299 MpegEncContext * const s = &h->s;
4300 static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4302 int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4304 //FIXME put trailing_onex into the context
4306 if(n == CHROMA_DC_BLOCK_INDEX){
4307 coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4308 total_coeff= coeff_token>>2;
4310 if(n == LUMA_DC_BLOCK_INDEX){
4311 total_coeff= pred_non_zero_count(h, 0);
4312 coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4313 total_coeff= coeff_token>>2;
4315 total_coeff= pred_non_zero_count(h, n);
4316 coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4317 total_coeff= coeff_token>>2;
4318 h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4322 //FIXME set last_non_zero?
4326 if(total_coeff > (unsigned)max_coeff) {
4327 av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4331 trailing_ones= coeff_token&3;
4332 tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4333 assert(total_coeff<=16);
4335 for(i=0; i<trailing_ones; i++){
4336 level[i]= 1 - 2*get_bits1(gb);
4340 int level_code, mask;
4341 int suffix_length = total_coeff > 10 && trailing_ones < 3;
4342 int prefix= get_level_prefix(gb);
4344 //first coefficient has suffix_length equal to 0 or 1
4345 if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4347 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4349 level_code= (prefix<<suffix_length); //part
4350 }else if(prefix==14){
4352 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4354 level_code= prefix + get_bits(gb, 4); //part
4355 }else if(prefix==15){
4356 level_code= (prefix<<suffix_length) + get_bits(gb, 12); //part
4357 if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4359 av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4363 if(trailing_ones < 3) level_code += 2;
4368 mask= -(level_code&1);
4369 level[i]= (((2+level_code)>>1) ^ mask) - mask;
4372 //remaining coefficients have suffix_length > 0
4373 for(;i<total_coeff;i++) {
4374 static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4375 prefix = get_level_prefix(gb);
4377 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4378 }else if(prefix==15){
4379 level_code = (prefix<<suffix_length) + get_bits(gb, 12);
4381 av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4384 mask= -(level_code&1);
4385 level[i]= (((2+level_code)>>1) ^ mask) - mask;
4386 if(level_code > suffix_limit[suffix_length])
4391 if(total_coeff == max_coeff)
4394 if(n == CHROMA_DC_BLOCK_INDEX)
4395 zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4397 zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4400 coeff_num = zeros_left + total_coeff - 1;
4401 j = scantable[coeff_num];
4403 block[j] = level[0];
4404 for(i=1;i<total_coeff;i++) {
4407 else if(zeros_left < 7){
4408 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4410 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4412 zeros_left -= run_before;
4413 coeff_num -= 1 + run_before;
4414 j= scantable[ coeff_num ];
4419 block[j] = (level[0] * qmul[j] + 32)>>6;
4420 for(i=1;i<total_coeff;i++) {
4423 else if(zeros_left < 7){
4424 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4426 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4428 zeros_left -= run_before;
4429 coeff_num -= 1 + run_before;
4430 j= scantable[ coeff_num ];
4432 block[j]= (level[i] * qmul[j] + 32)>>6;
4437 av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4444 static void predict_field_decoding_flag(H264Context *h){
4445 MpegEncContext * const s = &h->s;
4446 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4447 int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4448 ? s->current_picture.mb_type[mb_xy-1]
4449 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4450 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4452 h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4456 * decodes a P_SKIP or B_SKIP macroblock
4458 static void decode_mb_skip(H264Context *h){
4459 MpegEncContext * const s = &h->s;
4460 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4463 memset(h->non_zero_count[mb_xy], 0, 16);
4464 memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4467 mb_type|= MB_TYPE_INTERLACED;
4469 if( h->slice_type == FF_B_TYPE )
4471 // just for fill_caches. pred_direct_motion will set the real mb_type
4472 mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4474 fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4475 pred_direct_motion(h, &mb_type);
4476 mb_type|= MB_TYPE_SKIP;
4481 mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4483 fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4484 pred_pskip_motion(h, &mx, &my);
4485 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4486 fill_rectangle( h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4489 write_back_motion(h, mb_type);
4490 s->current_picture.mb_type[mb_xy]= mb_type;
4491 s->current_picture.qscale_table[mb_xy]= s->qscale;
4492 h->slice_table[ mb_xy ]= h->slice_num;
4493 h->prev_mb_skipped= 1;
4497 * decodes a macroblock
4498 * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4500 static int decode_mb_cavlc(H264Context *h){
4501 MpegEncContext * const s = &h->s;
4502 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4503 int partition_count;
4504 unsigned int mb_type, cbp;
4505 int dct8x8_allowed= h->pps.transform_8x8_mode;
4507 s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
4509 tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4510 cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4512 if(h->slice_type != FF_I_TYPE && h->slice_type != FF_SI_TYPE){
4513 if(s->mb_skip_run==-1)
4514 s->mb_skip_run= get_ue_golomb(&s->gb);
4516 if (s->mb_skip_run--) {
4517 if(FRAME_MBAFF && (s->mb_y&1) == 0){
4518 if(s->mb_skip_run==0)
4519 h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4521 predict_field_decoding_flag(h);
4528 if( (s->mb_y&1) == 0 )
4529 h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4531 h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
4533 h->prev_mb_skipped= 0;
4535 mb_type= get_ue_golomb(&s->gb);
4536 if(h->slice_type == FF_B_TYPE){
4538 partition_count= b_mb_type_info[mb_type].partition_count;
4539 mb_type= b_mb_type_info[mb_type].type;
4542 goto decode_intra_mb;
4544 }else if(h->slice_type == FF_P_TYPE /*|| h->slice_type == FF_SP_TYPE */){
4546 partition_count= p_mb_type_info[mb_type].partition_count;
4547 mb_type= p_mb_type_info[mb_type].type;
4550 goto decode_intra_mb;
4553 assert(h->slice_type == FF_I_TYPE);
4556 av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4560 cbp= i_mb_type_info[mb_type].cbp;
4561 h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4562 mb_type= i_mb_type_info[mb_type].type;
4566 mb_type |= MB_TYPE_INTERLACED;
4568 h->slice_table[ mb_xy ]= h->slice_num;
4570 if(IS_INTRA_PCM(mb_type)){
4573 // We assume these blocks are very rare so we do not optimize it.
4574 align_get_bits(&s->gb);
4576 // The pixels are stored in the same order as levels in h->mb array.
4577 for(y=0; y<16; y++){
4578 const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
4579 for(x=0; x<16; x++){
4580 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4581 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
4585 const int index= 256 + 4*(y&3) + 32*(y>>2);
4587 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4588 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4592 const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
4594 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4595 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4599 // In deblocking, the quantizer is 0
4600 s->current_picture.qscale_table[mb_xy]= 0;
4601 h->chroma_qp[0] = get_chroma_qp(h, 0, 0);
4602 h->chroma_qp[1] = get_chroma_qp(h, 1, 0);
4603 // All coeffs are present
4604 memset(h->non_zero_count[mb_xy], 16, 16);
4606 s->current_picture.mb_type[mb_xy]= mb_type;
4611 h->ref_count[0] <<= 1;
4612 h->ref_count[1] <<= 1;
4615 fill_caches(h, mb_type, 0);
4618 if(IS_INTRA(mb_type)){
4620 // init_top_left_availability(h);
4621 if(IS_INTRA4x4(mb_type)){
4624 if(dct8x8_allowed && get_bits1(&s->gb)){
4625 mb_type |= MB_TYPE_8x8DCT;
4629 // fill_intra4x4_pred_table(h);
4630 for(i=0; i<16; i+=di){
4631 int mode= pred_intra_mode(h, i);
4633 if(!get_bits1(&s->gb)){
4634 const int rem_mode= get_bits(&s->gb, 3);
4635 mode = rem_mode + (rem_mode >= mode);
4639 fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4641 h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4643 write_back_intra_pred_mode(h);
4644 if( check_intra4x4_pred_mode(h) < 0)
4647 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4648 if(h->intra16x16_pred_mode < 0)
4652 pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4655 h->chroma_pred_mode= pred_mode;
4656 }else if(partition_count==4){
4657 int i, j, sub_partition_count[4], list, ref[2][4];
4659 if(h->slice_type == FF_B_TYPE){
4661 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4662 if(h->sub_mb_type[i] >=13){
4663 av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4666 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4667 h->sub_mb_type[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4669 if( IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4670 || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4671 pred_direct_motion(h, &mb_type);
4672 h->ref_cache[0][scan8[4]] =
4673 h->ref_cache[1][scan8[4]] =
4674 h->ref_cache[0][scan8[12]] =
4675 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4678 assert(h->slice_type == FF_P_TYPE || h->slice_type == FF_SP_TYPE); //FIXME SP correct ?
4680 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4681 if(h->sub_mb_type[i] >=4){
4682 av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4685 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4686 h->sub_mb_type[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4690 for(list=0; list<h->list_count; list++){
4691 int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4693 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4694 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4695 unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4697 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4709 dct8x8_allowed = get_dct8x8_allowed(h);
4711 for(list=0; list<h->list_count; list++){
4713 if(IS_DIRECT(h->sub_mb_type[i])) {
4714 h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4717 h->ref_cache[list][ scan8[4*i] ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4718 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4720 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4721 const int sub_mb_type= h->sub_mb_type[i];
4722 const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4723 for(j=0; j<sub_partition_count[i]; j++){
4725 const int index= 4*i + block_width*j;
4726 int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4727 pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4728 mx += get_se_golomb(&s->gb);
4729 my += get_se_golomb(&s->gb);
4730 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4732 if(IS_SUB_8X8(sub_mb_type)){
4734 mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4736 mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4737 }else if(IS_SUB_8X4(sub_mb_type)){
4738 mv_cache[ 1 ][0]= mx;
4739 mv_cache[ 1 ][1]= my;
4740 }else if(IS_SUB_4X8(sub_mb_type)){
4741 mv_cache[ 8 ][0]= mx;
4742 mv_cache[ 8 ][1]= my;
4744 mv_cache[ 0 ][0]= mx;
4745 mv_cache[ 0 ][1]= my;
4748 uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4754 }else if(IS_DIRECT(mb_type)){
4755 pred_direct_motion(h, &mb_type);
4756 dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4758 int list, mx, my, i;
4759 //FIXME we should set ref_idx_l? to 0 if we use that later ...
4760 if(IS_16X16(mb_type)){
4761 for(list=0; list<h->list_count; list++){
4763 if(IS_DIR(mb_type, 0, list)){
4764 val= get_te0_golomb(&s->gb, h->ref_count[list]);
4765 if(val >= h->ref_count[list]){
4766 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4770 val= LIST_NOT_USED&0xFF;
4771 fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4773 for(list=0; list<h->list_count; list++){
4775 if(IS_DIR(mb_type, 0, list)){
4776 pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4777 mx += get_se_golomb(&s->gb);
4778 my += get_se_golomb(&s->gb);
4779 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4781 val= pack16to32(mx,my);
4784 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4787 else if(IS_16X8(mb_type)){
4788 for(list=0; list<h->list_count; list++){
4791 if(IS_DIR(mb_type, i, list)){
4792 val= get_te0_golomb(&s->gb, h->ref_count[list]);
4793 if(val >= h->ref_count[list]){
4794 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4798 val= LIST_NOT_USED&0xFF;
4799 fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4802 for(list=0; list<h->list_count; list++){
4805 if(IS_DIR(mb_type, i, list)){
4806 pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4807 mx += get_se_golomb(&s->gb);
4808 my += get_se_golomb(&s->gb);
4809 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4811 val= pack16to32(mx,my);
4814 fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4818 assert(IS_8X16(mb_type));
4819 for(list=0; list<h->list_count; list++){
4822 if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4823 val= get_te0_golomb(&s->gb, h->ref_count[list]);
4824 if(val >= h->ref_count[list]){
4825 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4829 val= LIST_NOT_USED&0xFF;
4830 fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4833 for(list=0; list<h->list_count; list++){
4836 if(IS_DIR(mb_type, i, list)){
4837 pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4838 mx += get_se_golomb(&s->gb);
4839 my += get_se_golomb(&s->gb);
4840 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4842 val= pack16to32(mx,my);
4845 fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4851 if(IS_INTER(mb_type))
4852 write_back_motion(h, mb_type);
4854 if(!IS_INTRA16x16(mb_type)){
4855 cbp= get_ue_golomb(&s->gb);
4857 av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4861 if(IS_INTRA4x4(mb_type))
4862 cbp= golomb_to_intra4x4_cbp[cbp];
4864 cbp= golomb_to_inter_cbp[cbp];
4868 if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4869 if(get_bits1(&s->gb))
4870 mb_type |= MB_TYPE_8x8DCT;
4872 s->current_picture.mb_type[mb_xy]= mb_type;
4874 if(cbp || IS_INTRA16x16(mb_type)){
4875 int i8x8, i4x4, chroma_idx;
4877 GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4878 const uint8_t *scan, *scan8x8, *dc_scan;
4880 // fill_non_zero_count_cache(h);
4882 if(IS_INTERLACED(mb_type)){
4883 scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4884 scan= s->qscale ? h->field_scan : h->field_scan_q0;
4885 dc_scan= luma_dc_field_scan;
4887 scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4888 scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4889 dc_scan= luma_dc_zigzag_scan;
4892 dquant= get_se_golomb(&s->gb);
4894 if( dquant > 25 || dquant < -26 ){
4895 av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4899 s->qscale += dquant;
4900 if(((unsigned)s->qscale) > 51){
4901 if(s->qscale<0) s->qscale+= 52;
4902 else s->qscale-= 52;
4905 h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4906 h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4907 if(IS_INTRA16x16(mb_type)){
4908 if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4909 return -1; //FIXME continue if partitioned and other return -1 too
4912 assert((cbp&15) == 0 || (cbp&15) == 15);
4915 for(i8x8=0; i8x8<4; i8x8++){
4916 for(i4x4=0; i4x4<4; i4x4++){
4917 const int index= i4x4 + 4*i8x8;
4918 if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4924 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4927 for(i8x8=0; i8x8<4; i8x8++){
4928 if(cbp & (1<<i8x8)){
4929 if(IS_8x8DCT(mb_type)){
4930 DCTELEM *buf = &h->mb[64*i8x8];
4932 for(i4x4=0; i4x4<4; i4x4++){
4933 if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4934 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4937 nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4938 nnz[0] += nnz[1] + nnz[8] + nnz[9];
4940 for(i4x4=0; i4x4<4; i4x4++){
4941 const int index= i4x4 + 4*i8x8;
4943 if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4949 uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4950 nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4956 for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4957 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4963 for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4964 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4965 for(i4x4=0; i4x4<4; i4x4++){
4966 const int index= 16 + 4*chroma_idx + i4x4;
4967 if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4973 uint8_t * const nnz= &h->non_zero_count_cache[0];
4974 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4975 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4978 uint8_t * const nnz= &h->non_zero_count_cache[0];
4979 fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4980 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4981 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4983 s->current_picture.qscale_table[mb_xy]= s->qscale;
4984 write_back_non_zero_count(h);
4987 h->ref_count[0] >>= 1;
4988 h->ref_count[1] >>= 1;
4994 static int decode_cabac_field_decoding_flag(H264Context *h) {
4995 MpegEncContext * const s = &h->s;
4996 const int mb_x = s->mb_x;
4997 const int mb_y = s->mb_y & ~1;
4998 const int mba_xy = mb_x - 1 + mb_y *s->mb_stride;
4999 const int mbb_xy = mb_x + (mb_y-2)*s->mb_stride;
5001 unsigned int ctx = 0;
5003 if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
5006 if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
5010 return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
5013 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
5014 uint8_t *state= &h->cabac_state[ctx_base];
5018 MpegEncContext * const s = &h->s;
5019 const int mba_xy = h->left_mb_xy[0];
5020 const int mbb_xy = h->top_mb_xy;
5022 if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
5024 if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
5026 if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
5027 return 0; /* I4x4 */
5030 if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
5031 return 0; /* I4x4 */
5034 if( get_cabac_terminate( &h->cabac ) )
5035 return 25; /* PCM */
5037 mb_type = 1; /* I16x16 */
5038 mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
5039 if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
5040 mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
5041 mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
5042 mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
5046 static int decode_cabac_mb_type( H264Context *h ) {
5047 MpegEncContext * const s = &h->s;
5049 if( h->slice_type == FF_I_TYPE ) {
5050 return decode_cabac_intra_mb_type(h, 3, 1);
5051 } else if( h->slice_type == FF_P_TYPE ) {
5052 if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5054 if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5055 /* P_L0_D16x16, P_8x8 */
5056 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5058 /* P_L0_D8x16, P_L0_D16x8 */
5059 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5062 return decode_cabac_intra_mb_type(h, 17, 0) + 5;
5064 } else if( h->slice_type == FF_B_TYPE ) {
5065 const int mba_xy = h->left_mb_xy[0];
5066 const int mbb_xy = h->top_mb_xy;
5070 if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
5072 if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
5075 if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
5076 return 0; /* B_Direct_16x16 */
5078 if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
5079 return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
5082 bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
5083 bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
5084 bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
5085 bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5087 return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
5088 else if( bits == 13 ) {
5089 return decode_cabac_intra_mb_type(h, 32, 0) + 23;
5090 } else if( bits == 14 )
5091 return 11; /* B_L1_L0_8x16 */
5092 else if( bits == 15 )
5093 return 22; /* B_8x8 */
5095 bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5096 return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5098 /* TODO SI/SP frames? */
5103 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
5104 MpegEncContext * const s = &h->s;
5108 if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
5109 int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
5112 && h->slice_table[mba_xy] == h->slice_num
5113 && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
5114 mba_xy += s->mb_stride;
5116 mbb_xy = mb_xy - s->mb_stride;
5118 && h->slice_table[mbb_xy] == h->slice_num
5119 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
5120 mbb_xy -= s->mb_stride;
5122 mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
5124 int mb_xy = mb_x + mb_y*s->mb_stride;
5126 mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
5129 if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5131 if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5134 if( h->slice_type == FF_B_TYPE )
5136 return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
5139 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5142 if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5145 mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5146 mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5147 mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5149 if( mode >= pred_mode )
5155 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5156 const int mba_xy = h->left_mb_xy[0];
5157 const int mbb_xy = h->top_mb_xy;
5161 /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5162 if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5165 if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5168 if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5171 if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5173 if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5179 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5180 int cbp_b, cbp_a, ctx, cbp = 0;
5182 cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5183 cbp_b = h->slice_table[h->top_mb_xy] == h->slice_num ? h->top_cbp : -1;
5185 ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5186 cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5187 ctx = !(cbp & 0x01) + 2 * !(cbp_b & 0x08);
5188 cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5189 ctx = !(cbp_a & 0x08) + 2 * !(cbp & 0x01);
5190 cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5191 ctx = !(cbp & 0x04) + 2 * !(cbp & 0x02);
5192 cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5195 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5199 cbp_a = (h->left_cbp>>4)&0x03;
5200 cbp_b = (h-> top_cbp>>4)&0x03;
5203 if( cbp_a > 0 ) ctx++;
5204 if( cbp_b > 0 ) ctx += 2;
5205 if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5209 if( cbp_a == 2 ) ctx++;
5210 if( cbp_b == 2 ) ctx += 2;
5211 return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5213 static int decode_cabac_mb_dqp( H264Context *h) {
5217 if( h->last_qscale_diff != 0 )
5220 while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5226 if(val > 102) //prevent infinite loop
5233 return -(val + 1)/2;
5235 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5236 if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5238 if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5240 if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
/**
 * Decodes the sub-macroblock type of one 8x8 block in a B slice
 * (CABAC states 36..39); the result indexes b_sub_mb_type_info.
 * NOTE(review): partial listing — the declaration of 'type' and some
 * intermediate lines are elided.
 */
5244 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5246 if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5247 return 0; /* B_Direct_8x8 */
5248 if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5249 return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5251 if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5252 if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5253 return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
/* remaining types: two suffix bins (state 39) select the exact type */
5256 type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5257 type += get_cabac( &h->cabac, &h->cabac_state[39] );
/* Decodes transform_size_8x8_flag; the context (states 399..401) depends
 * on how many already-decoded neighbours use the 8x8 transform,
 * as cached in h->neighbor_transform_size. */
5261 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5262 return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
/**
 * Decodes a reference index for block n of the given list
 * (unary binarization, CABAC states 54..59).
 * The context derives from the cached left/top reference indices;
 * in B slices, direct-predicted neighbours do not count as "ref > 0".
 * NOTE(review): partial listing — ctx/ref declarations and the loop
 * body are elided.
 */
5265 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5266 int refa = h->ref_cache[list][scan8[n] - 1];
5267 int refb = h->ref_cache[list][scan8[n] - 8];
5271 if( h->slice_type == FF_B_TYPE) {
5272 if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5274 if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5283 while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
/* sanity bound: guard against corrupted streams producing huge indices */
5289 if(ref >= 32 /*h->ref_list[list]*/){
5290 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
5291 return 0; //FIXME we should return -1 and check the return everywhere
/**
 * Decodes one motion-vector-difference component (l: 0 = x, 1 = y)
 * for block n of the given list.
 * Binarization: up to 9 context-coded unary bins (context base 40 for
 * x, 47 for y), then a bypass-coded exp-golomb-style suffix, then a
 * bypass-coded sign.  The first-bin context depends on the summed
 * magnitude of the neighbouring cached MVDs (amvd).
 * NOTE(review): partial listing — ctx/mvd/k declarations and several
 * loop-body lines are elided.
 */
5297 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5298 int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5299 abs( h->mvd_cache[list][scan8[n] - 8][l] );
5300 int ctxbase = (l == 0) ? 40 : 47;
5305 else if( amvd > 32 )
5310 if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5315 while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
/* suffix: bypass-coded exponential part */
5323 while( get_cabac_bypass( &h->cabac ) ) {
5327 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5332 if( get_cabac_bypass( &h->cabac ) )
/* final bypass bin decides the sign */
5336 return get_cabac_bypass_sign( &h->cabac, -mvd );
/**
 * Computes the CABAC context offset for the coded_block_flag of one
 * residual block, from the left/top neighbour non-zero information;
 * the returned index is ctx + 4*cat.
 * cat: 0 = luma DC, 1 = luma AC (16x16), 2 = luma 4x4,
 *      3 = chroma DC, 4 = chroma AC.
 * NOTE(review): partial listing — the nza/nzb declarations, the cat==0
 * branch header and the ctx computation from nza/nzb are elided.
 */
5339 static inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
/* luma DC: the neighbour's flag is cached as bit 8 of its cbp */
5344 nza = h->left_cbp&0x100;
5345 nzb = h-> top_cbp&0x100;
5346 } else if( cat == 1 || cat == 2 ) {
5347 nza = h->non_zero_count_cache[scan8[idx] - 1];
5348 nzb = h->non_zero_count_cache[scan8[idx] - 8];
5349 } else if( cat == 3 ) {
/* chroma DC: per-component flags live in cbp bits 6..7 (idx = iCbCr) */
5350 nza = (h->left_cbp>>(6+idx))&0x01;
5351 nzb = (h-> top_cbp>>(6+idx))&0x01;
5354 nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5355 nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5364 return ctx + 4 * cat;
/* Maps each of the 63 scan positions of an 8x8 block to the context
 * increment used for its last_significant_coeff_flag. */
5367 DECLARE_ASM_CONST(1, const uint8_t, last_coeff_flag_offset_8x8[63]) = {
5368 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5369 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5370 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5371 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
/**
 * Decodes one residual block with CABAC: coded_block_flag, the
 * significance map, and the coefficient levels/signs, writing the
 * (optionally dequantized, via qmul) coefficients into 'block' in
 * 'scantable' order and updating the non-zero-count caches.
 * NOTE(review): partial listing — several declarations (index[],
 * abslevel1, the CABACContext cc, parts of the DECODE_SIGNIFICANCE
 * macro and of the level loop) are elided between the numbered lines.
 */
5374 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff) {
5375 const int mb_xy = h->s.mb_x + h->s.mb_y*h->s.mb_stride;
/* context base tables, indexed [MB_FIELD][cat]; the second row holds
 * the separate field-coding contexts */
5376 static const int significant_coeff_flag_offset[2][6] = {
5377 { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5378 { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5380 static const int last_coeff_flag_offset[2][6] = {
5381 { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5382 { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5384 static const int coeff_abs_level_m1_offset[6] = {
5385 227+0, 227+10, 227+20, 227+30, 227+39, 426
/* significance-map context increments for the 63 8x8 scan positions,
 * [0] = frame coding, [1] = field coding */
5387 static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5388 { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5389 4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5390 7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5391 12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5392 { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5393 6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5394 9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5395 9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5401 int coeff_count = 0;
5404 int abslevelgt1 = 0;
5406 uint8_t *significant_coeff_ctx_base;
5407 uint8_t *last_coeff_ctx_base;
5408 uint8_t *abs_level_m1_ctx_base;
/* work on a local copy of the CABAC state so the compiler can keep it
 * in registers; copied back before every return */
5411 #define CABAC_ON_STACK
5413 #ifdef CABAC_ON_STACK
5416 cc.range = h->cabac.range;
5417 cc.low = h->cabac.low;
5418 cc.bytestream= h->cabac.bytestream;
5420 #define CC &h->cabac
5424 /* cat: 0-> DC 16x16 n = 0
5425 * 1-> AC 16x16 n = luma4x4idx
5426 * 2-> Luma4x4 n = luma4x4idx
5427 * 3-> DC Chroma n = iCbCr
5428 * 4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5429 * 5-> Luma8x8 n = 4 * luma8x8idx
5432 /* read coded block flag */
5434 if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n ) ] ) == 0 ) {
/* no coefficients: clear the non-zero-count cache entry and return */
5435 if( cat == 1 || cat == 2 )
5436 h->non_zero_count_cache[scan8[n]] = 0;
5438 h->non_zero_count_cache[scan8[16+n]] = 0;
5439 #ifdef CABAC_ON_STACK
5440 h->cabac.range = cc.range ;
5441 h->cabac.low = cc.low ;
5442 h->cabac.bytestream= cc.bytestream;
5448 significant_coeff_ctx_base = h->cabac_state
5449 + significant_coeff_flag_offset[MB_FIELD][cat];
5450 last_coeff_ctx_base = h->cabac_state
5451 + last_coeff_flag_offset[MB_FIELD][cat];
5452 abs_level_m1_ctx_base = h->cabac_state
5453 + coeff_abs_level_m1_offset[cat];
/* significance map: record positions of non-zero coefficients in
 * index[], stopping early when last_coeff_flag fires */
5456 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5457 for(last= 0; last < coefs; last++) { \
5458 uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5459 if( get_cabac( CC, sig_ctx )) { \
5460 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5461 index[coeff_count++] = last; \
5462 if( get_cabac( CC, last_ctx ) ) { \
5468 if( last == max_coeff -1 ) {\
5469 index[coeff_count++] = last;\
5471 const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
/* x86 asm fast paths; the generic C macro is the fallback */
5472 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5473 coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5475 coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5477 DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5479 DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5482 assert(coeff_count > 0);
/* record per-category non-zero info in the caches/cbp table */
5485 h->cbp_table[mb_xy] |= 0x100;
5486 else if( cat == 1 || cat == 2 )
5487 h->non_zero_count_cache[scan8[n]] = coeff_count;
5489 h->cbp_table[mb_xy] |= 0x40 << n;
5491 h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5494 fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
/* decode levels in reverse scan order: coeff_abs_level_minus1 (unary
 * prefix + bypass exp-golomb tail) and a bypass sign bit */
5497 for( coeff_count--; coeff_count >= 0; coeff_count-- ) {
5498 uint8_t *ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 )) + abs_level_m1_ctx_base;
5499 int j= scantable[index[coeff_count]];
5501 if( get_cabac( CC, ctx ) == 0 ) {
/* |level| == 1: only the sign remains */
5503 block[j] = get_cabac_bypass_sign( CC, -1);
5505 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5511 ctx = 5 + FFMIN( 4, abslevelgt1 ) + abs_level_m1_ctx_base;
5512 while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5516 if( coeff_abs >= 15 ) {
/* escape: bypass-coded exponential suffix for large magnitudes */
5518 while( get_cabac_bypass( CC ) ) {
5524 coeff_abs += coeff_abs + get_cabac_bypass( CC );
5530 if( get_cabac_bypass( CC ) ) block[j] = -coeff_abs;
5531 else block[j] = coeff_abs;
5533 if( get_cabac_bypass( CC ) ) block[j] = (-coeff_abs * qmul[j] + 32) >> 6;
5534 else block[j] = ( coeff_abs * qmul[j] + 32) >> 6;
5540 #ifdef CABAC_ON_STACK
5541 h->cabac.range = cc.range ;
5542 h->cabac.low = cc.low ;
5543 h->cabac.bytestream= cc.bytestream;
/**
 * Computes h->top_mb_xy and h->left_mb_xy[] for the current macroblock,
 * with the MBAFF adjustments: when the neighbour pair uses a different
 * frame/field coding mode than the current MB, the neighbour index is
 * moved to the appropriate member of its MB pair.
 * NOTE(review): partial listing — the MBAFF guard around the pair logic
 * and some closing braces are elided.
 */
5548 static inline void compute_mb_neighbors(H264Context *h)
5550 MpegEncContext * const s = &h->s;
5551 const int mb_xy = s->mb_x + s->mb_y*s->mb_stride;
/* default (progressive) neighbours: directly above and to the left */
5552 h->top_mb_xy = mb_xy - s->mb_stride;
5553 h->left_mb_xy[0] = mb_xy - 1;
5555 const int pair_xy = s->mb_x + (s->mb_y & ~1)*s->mb_stride;
5556 const int top_pair_xy = pair_xy - s->mb_stride;
5557 const int top_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5558 const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5559 const int curr_mb_frame_flag = !MB_FIELD;
5560 const int bottom = (s->mb_y & 1);
5562 ? !curr_mb_frame_flag // bottom macroblock
5563 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5565 h->top_mb_xy -= s->mb_stride;
5567 if (left_mb_frame_flag != curr_mb_frame_flag) {
5568 h->left_mb_xy[0] = pair_xy - 1;
5570 } else if (FIELD_PICTURE) {
/* field pictures: rows are interleaved, so "above" is one MB row further */
5571 h->top_mb_xy -= s->mb_stride;
5577 * decodes a macroblock
5578 * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
/* NOTE(review): partial listing — many lines (skip handling branches,
 * declarations, else branches, closing braces) are elided between the
 * numbered lines of this function. */
5580 static int decode_mb_cabac(H264Context *h) {
5581 MpegEncContext * const s = &h->s;
5582 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5583 int mb_type, partition_count, cbp = 0;
5584 int dct8x8_allowed= h->pps.transform_8x8_mode;
5586 s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5588 tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
/* --- skip-flag handling (inter slices only) --- */
5589 if( h->slice_type != FF_I_TYPE && h->slice_type != FF_SI_TYPE ) {
5591 /* a skipped mb needs the aff flag from the following mb */
5592 if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5593 predict_field_decoding_flag(h);
5594 if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5595 skip = h->next_mb_skipped;
5597 skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5598 /* read skip flags */
5600 if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5601 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5602 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5603 if(h->next_mb_skipped)
5604 predict_field_decoding_flag(h);
5606 h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5611 h->cbp_table[mb_xy] = 0;
5612 h->chroma_pred_mode_table[mb_xy] = 0;
5613 h->last_qscale_diff = 0;
5620 if( (s->mb_y&1) == 0 )
5622 h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5624 h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5626 h->prev_mb_skipped = 0;
/* --- macroblock type --- */
5628 compute_mb_neighbors(h);
5629 if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5630 av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
/* translate the raw type through the per-slice-type info tables; inter
 * slices fall through to the intra path for intra types */
5634 if( h->slice_type == FF_B_TYPE ) {
5636 partition_count= b_mb_type_info[mb_type].partition_count;
5637 mb_type= b_mb_type_info[mb_type].type;
5640 goto decode_intra_mb;
5642 } else if( h->slice_type == FF_P_TYPE ) {
5644 partition_count= p_mb_type_info[mb_type].partition_count;
5645 mb_type= p_mb_type_info[mb_type].type;
5648 goto decode_intra_mb;
5651 assert(h->slice_type == FF_I_TYPE);
5653 partition_count = 0;
5654 cbp= i_mb_type_info[mb_type].cbp;
5655 h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5656 mb_type= i_mb_type_info[mb_type].type;
5659 mb_type |= MB_TYPE_INTERLACED;
5661 h->slice_table[ mb_xy ]= h->slice_num;
/* --- PCM macroblock: raw samples follow in the bytestream --- */
5663 if(IS_INTRA_PCM(mb_type)) {
5667 // We assume these blocks are very rare so we do not optimize it.
5668 // FIXME The two following lines get the bitstream position in the cabac
5669 // decode, I think it should be done by a function in cabac.h (or cabac.c).
5670 ptr= h->cabac.bytestream;
5671 if(h->cabac.low&0x1) ptr--;
5673 if(h->cabac.low&0x1FF) ptr--;
5676 // The pixels are stored in the same order as levels in h->mb array.
5677 for(y=0; y<16; y++){
5678 const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
5679 for(x=0; x<16; x++){
5680 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", *ptr);
5681 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
5685 const int index= 256 + 4*(y&3) + 32*(y>>2);
5687 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", *ptr);
5688 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5692 const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
5694 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", *ptr);
5695 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
/* restart the CABAC decoder after the raw PCM bytes */
5699 ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5701 // All blocks are present
5702 h->cbp_table[mb_xy] = 0x1ef;
5703 h->chroma_pred_mode_table[mb_xy] = 0;
5704 // In deblocking, the quantizer is 0
5705 s->current_picture.qscale_table[mb_xy]= 0;
5706 h->chroma_qp[0] = get_chroma_qp(h, 0, 0);
5707 h->chroma_qp[1] = get_chroma_qp(h, 1, 0);
5708 // All coeffs are present
5709 memset(h->non_zero_count[mb_xy], 16, 16);
5710 s->current_picture.mb_type[mb_xy]= mb_type;
/* MBAFF: the caches index references per field, so double temporarily */
5715 h->ref_count[0] <<= 1;
5716 h->ref_count[1] <<= 1;
5719 fill_caches(h, mb_type, 0);
/* --- intra prediction modes --- */
5721 if( IS_INTRA( mb_type ) ) {
5723 if( IS_INTRA4x4( mb_type ) ) {
5724 if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5725 mb_type |= MB_TYPE_8x8DCT;
5726 for( i = 0; i < 16; i+=4 ) {
5727 int pred = pred_intra_mode( h, i );
5728 int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5729 fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5732 for( i = 0; i < 16; i++ ) {
5733 int pred = pred_intra_mode( h, i );
5734 h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5736 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5739 write_back_intra_pred_mode(h);
5740 if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5742 h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5743 if( h->intra16x16_pred_mode < 0 ) return -1;
5745 h->chroma_pred_mode_table[mb_xy] =
5746 pred_mode = decode_cabac_mb_chroma_pre_mode( h );
5748 pred_mode= check_intra_pred_mode( h, pred_mode );
5749 if( pred_mode < 0 ) return -1;
5750 h->chroma_pred_mode= pred_mode;
/* --- 8x8 partitions: sub-mb types, refs and MVs per sub-block --- */
5751 } else if( partition_count == 4 ) {
5752 int i, j, sub_partition_count[4], list, ref[2][4];
5754 if( h->slice_type == FF_B_TYPE ) {
5755 for( i = 0; i < 4; i++ ) {
5756 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5757 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5758 h->sub_mb_type[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5760 if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5761 h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5762 pred_direct_motion(h, &mb_type);
5763 h->ref_cache[0][scan8[4]] =
5764 h->ref_cache[1][scan8[4]] =
5765 h->ref_cache[0][scan8[12]] =
5766 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5767 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5768 for( i = 0; i < 4; i++ )
5769 if( IS_DIRECT(h->sub_mb_type[i]) )
5770 fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5774 for( i = 0; i < 4; i++ ) {
5775 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5776 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5777 h->sub_mb_type[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5781 for( list = 0; list < h->list_count; list++ ) {
5782 for( i = 0; i < 4; i++ ) {
5783 if(IS_DIRECT(h->sub_mb_type[i])) continue;
5784 if(IS_DIR(h->sub_mb_type[i], 0, list)){
5785 if( h->ref_count[list] > 1 )
5786 ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5792 h->ref_cache[list][ scan8[4*i]+1 ]=
5793 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5798 dct8x8_allowed = get_dct8x8_allowed(h);
5800 for(list=0; list<h->list_count; list++){
5802 h->ref_cache[list][ scan8[4*i] ]=h->ref_cache[list][ scan8[4*i]+1 ];
5803 if(IS_DIRECT(h->sub_mb_type[i])){
5804 fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5808 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5809 const int sub_mb_type= h->sub_mb_type[i];
5810 const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5811 for(j=0; j<sub_partition_count[i]; j++){
5814 const int index= 4*i + block_width*j;
5815 int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5816 int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5817 pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5819 mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5820 my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5821 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
/* replicate the MV/MVD across the cache cells the partition covers */
5823 if(IS_SUB_8X8(sub_mb_type)){
5825 mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5827 mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5830 mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5832 mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5833 }else if(IS_SUB_8X4(sub_mb_type)){
5834 mv_cache[ 1 ][0]= mx;
5835 mv_cache[ 1 ][1]= my;
5837 mvd_cache[ 1 ][0]= mx - mpx;
5838 mvd_cache[ 1 ][1]= my - mpy;
5839 }else if(IS_SUB_4X8(sub_mb_type)){
5840 mv_cache[ 8 ][0]= mx;
5841 mv_cache[ 8 ][1]= my;
5843 mvd_cache[ 8 ][0]= mx - mpx;
5844 mvd_cache[ 8 ][1]= my - mpy;
5846 mv_cache[ 0 ][0]= mx;
5847 mv_cache[ 0 ][1]= my;
5849 mvd_cache[ 0 ][0]= mx - mpx;
5850 mvd_cache[ 0 ][1]= my - mpy;
5853 uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5854 uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5855 p[0] = p[1] = p[8] = p[9] = 0;
5856 pd[0]= pd[1]= pd[8]= pd[9]= 0;
/* --- direct macroblock --- */
5860 } else if( IS_DIRECT(mb_type) ) {
5861 pred_direct_motion(h, &mb_type);
5862 fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5863 fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5864 dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
/* --- 16x16 / 16x8 / 8x16 inter partitions --- */
5866 int list, mx, my, i, mpx, mpy;
5867 if(IS_16X16(mb_type)){
5868 for(list=0; list<h->list_count; list++){
5869 if(IS_DIR(mb_type, 0, list)){
5870 const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
5871 fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5873 fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5875 for(list=0; list<h->list_count; list++){
5876 if(IS_DIR(mb_type, 0, list)){
5877 pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5879 mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5880 my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5881 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5883 fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5884 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5886 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5889 else if(IS_16X8(mb_type)){
5890 for(list=0; list<h->list_count; list++){
5892 if(IS_DIR(mb_type, i, list)){
5893 const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
5894 fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5896 fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5899 for(list=0; list<h->list_count; list++){
5901 if(IS_DIR(mb_type, i, list)){
5902 pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5903 mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5904 my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5905 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5907 fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5908 fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5910 fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5911 fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5916 assert(IS_8X16(mb_type));
5917 for(list=0; list<h->list_count; list++){
5919 if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5920 const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
5921 fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5923 fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5926 for(list=0; list<h->list_count; list++){
5928 if(IS_DIR(mb_type, i, list)){
5929 pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5930 mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5931 my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5933 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5934 fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5935 fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5937 fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5938 fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5945 if( IS_INTER( mb_type ) ) {
5946 h->chroma_pred_mode_table[mb_xy] = 0;
5947 write_back_motion( h, mb_type );
/* --- coded block pattern (not signalled for intra16x16) --- */
5950 if( !IS_INTRA16x16( mb_type ) ) {
5951 cbp = decode_cabac_mb_cbp_luma( h );
5952 cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5955 h->cbp_table[mb_xy] = h->cbp = cbp;
5957 if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5958 if( decode_cabac_mb_transform_size( h ) )
5959 mb_type |= MB_TYPE_8x8DCT;
5961 s->current_picture.mb_type[mb_xy]= mb_type;
/* --- residual decoding --- */
5963 if( cbp || IS_INTRA16x16( mb_type ) ) {
5964 const uint8_t *scan, *scan8x8, *dc_scan;
5965 const uint32_t *qmul;
5968 if(IS_INTERLACED(mb_type)){
5969 scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5970 scan= s->qscale ? h->field_scan : h->field_scan_q0;
5971 dc_scan= luma_dc_field_scan;
5973 scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5974 scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5975 dc_scan= luma_dc_zigzag_scan;
5978 h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5979 if( dqp == INT_MIN ){
5980 av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
/* wrap qscale back into the legal 0..51 range */
5984 if(((unsigned)s->qscale) > 51){
5985 if(s->qscale<0) s->qscale+= 52;
5986 else s->qscale-= 52;
5988 h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5989 h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5991 if( IS_INTRA16x16( mb_type ) ) {
5993 //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5994 decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5997 qmul = h->dequant4_coeff[0][s->qscale];
5998 for( i = 0; i < 16; i++ ) {
5999 //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
6000 decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
6003 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
6007 for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
6008 if( cbp & (1<<i8x8) ) {
6009 if( IS_8x8DCT(mb_type) ) {
6010 decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
6011 scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
6013 qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
6014 for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
6015 const int index = 4*i8x8 + i4x4;
6016 //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
6018 decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
6019 //STOP_TIMER("decode_residual")
6023 uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
6024 nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
6031 for( c = 0; c < 2; c++ ) {
6032 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
6033 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
6039 for( c = 0; c < 2; c++ ) {
6040 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
6041 for( i = 0; i < 4; i++ ) {
6042 const int index = 16 + 4 * c + i;
6043 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
6044 decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15);
6048 uint8_t * const nnz= &h->non_zero_count_cache[0];
6049 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6050 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
/* no residual at all: clear all non-zero-count caches */
6053 uint8_t * const nnz= &h->non_zero_count_cache[0];
6054 fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
6055 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6056 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6057 h->last_qscale_diff = 0;
6060 s->current_picture.qscale_table[mb_xy]= s->qscale;
6061 write_back_non_zero_count(h);
/* undo the MBAFF ref_count doubling done above */
6064 h->ref_count[0] >>= 1;
6065 h->ref_count[1] >>= 1;
/**
 * Deblocks one vertical luma edge (16 pixels tall).
 * For bS < 4 the clipped normal filter is delegated to the DSP
 * function; bS == 4 (intra edge) uses the strong filter inline.
 * NOTE(review): partial listing — the bS<4 branch header, loop
 * variables and some braces are elided.
 */
6072 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6074 const int index_a = qp + h->slice_alpha_c0_offset;
6075 const int alpha = (alpha_table+52)[index_a];
6076 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
6081 tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6082 h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6084 /* 16px edge length, because bS=4 is triggered by being at
6085 * the edge of an intra MB, so all 4 bS are the same */
6086 for( d = 0; d < 16; d++ ) {
6087 const int p0 = pix[-1];
6088 const int p1 = pix[-2];
6089 const int p2 = pix[-3];
6091 const int q0 = pix[0];
6092 const int q1 = pix[1];
6093 const int q2 = pix[2];
/* filter only if the edge looks like a blocking artifact, not real detail */
6095 if( FFABS( p0 - q0 ) < alpha &&
6096 FFABS( p1 - p0 ) < beta &&
6097 FFABS( q1 - q0 ) < beta ) {
6099 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6100 if( FFABS( p2 - p0 ) < beta)
6102 const int p3 = pix[-4];
/* strong filtering of the p side (3 samples) */
6104 pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6105 pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6106 pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6109 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6111 if( FFABS( q2 - q0 ) < beta)
6113 const int q3 = pix[3];
/* strong filtering of the q side (3 samples) */
6115 pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6116 pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6117 pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6120 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
/* weak fallback: only p0/q0 are modified */
6124 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6125 pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6127 tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
/**
 * Deblocks one vertical chroma edge; both the clipped (bS < 4) and
 * intra-strength filters are delegated to the DSP functions.
 * NOTE(review): partial listing — the tc[] declaration, loop header
 * and branch structure are elided.
 */
6133 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6135 const int index_a = qp + h->slice_alpha_c0_offset;
6136 const int alpha = (alpha_table+52)[index_a];
6137 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
/* chroma uses tc0+1 (and 0 as the "skip" marker) per the chroma filter DSP contract */
6142 tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6143 h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6145 h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
/**
 * Deblocks one vertical luma edge of an MBAFF macroblock, row by row:
 * each of the 16 rows can have its own bS and (via qp[2]) its own QP,
 * so the filter is applied per-pixel-row rather than via the DSP call.
 * NOTE(review): partial listing — parts of the bS_index computation
 * and several braces are elided.
 */
6149 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6151 for( i = 0; i < 16; i++, pix += stride) {
6157 int bS_index = (i >> 1);
6160 bS_index |= (i & 1);
6163 if( bS[bS_index] == 0 ) {
/* pick the QP of whichever field/neighbour this row belongs to */
6167 qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6168 index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6169 alpha = (alpha_table+52)[index_a];
6170 beta = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
/* normal (clipped) filter for bS 1..3 */
6172 if( bS[bS_index] < 4 ) {
6173 const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
6174 const int p0 = pix[-1];
6175 const int p1 = pix[-2];
6176 const int p2 = pix[-3];
6177 const int q0 = pix[0];
6178 const int q1 = pix[1];
6179 const int q2 = pix[2];
6181 if( FFABS( p0 - q0 ) < alpha &&
6182 FFABS( p1 - p0 ) < beta &&
6183 FFABS( q1 - q0 ) < beta ) {
6187 if( FFABS( p2 - p0 ) < beta ) {
6188 pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6191 if( FFABS( q2 - q0 ) < beta ) {
6192 pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6196 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6197 pix[-1] = av_clip_uint8( p0 + i_delta ); /* p0' */
6198 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
6199 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
/* strong (intra, bS == 4) filter */
6202 const int p0 = pix[-1];
6203 const int p1 = pix[-2];
6204 const int p2 = pix[-3];
6206 const int q0 = pix[0];
6207 const int q1 = pix[1];
6208 const int q2 = pix[2];
6210 if( FFABS( p0 - q0 ) < alpha &&
6211 FFABS( p1 - p0 ) < beta &&
6212 FFABS( q1 - q0 ) < beta ) {
6214 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6215 if( FFABS( p2 - p0 ) < beta)
6217 const int p3 = pix[-4];
6219 pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6220 pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6221 pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6224 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6226 if( FFABS( q2 - q0 ) < beta)
6228 const int q3 = pix[3];
6230 pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6231 pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6232 pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6235 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6239 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6240 pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6242 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
/**
 * Deblocks one vertical chroma edge of an MBAFF macroblock, row by row
 * (8 chroma rows), selecting bS and QP per row as in the luma variant.
 * Chroma only modifies p0/q0.
 * NOTE(review): partial listing — the bS_index computation and several
 * braces are elided.
 */
6247 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6249 for( i = 0; i < 8; i++, pix += stride) {
6257 if( bS[bS_index] == 0 ) {
6261 qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6262 index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6263 alpha = (alpha_table+52)[index_a];
6264 beta = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
/* normal (clipped) chroma filter: tc = tc0 + 1 */
6266 if( bS[bS_index] < 4 ) {
6267 const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6268 const int p0 = pix[-1];
6269 const int p1 = pix[-2];
6270 const int q0 = pix[0];
6271 const int q1 = pix[1];
6273 if( FFABS( p0 - q0 ) < alpha &&
6274 FFABS( p1 - p0 ) < beta &&
6275 FFABS( q1 - q0 ) < beta ) {
6276 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6278 pix[-1] = av_clip_uint8( p0 + i_delta ); /* p0' */
6279 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
6280 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
/* strong (intra, bS == 4) chroma filter */
6283 const int p0 = pix[-1];
6284 const int p1 = pix[-2];
6285 const int q0 = pix[0];
6286 const int q1 = pix[1];
6288 if( FFABS( p0 - q0 ) < alpha &&
6289 FFABS( p1 - p0 ) < beta &&
6290 FFABS( q1 - q0 ) < beta ) {
6292 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
6293 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
6294 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
/**
 * Deblocks one horizontal luma edge; the horizontal counterpart of
 * filter_mb_edgev, addressing samples across rows via pix_next.
 * For bS < 4 the clipped filter is delegated to the DSP function;
 * bS == 4 uses the strong filter inline.
 * NOTE(review): partial listing — the bS<4 branch header, loop
 * variables and some braces are elided.
 */
6300 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6302 const int index_a = qp + h->slice_alpha_c0_offset;
6303 const int alpha = (alpha_table+52)[index_a];
6304 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
6305 const int pix_next = stride;
6310 tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6311 h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6313 /* 16px edge length, see filter_mb_edgev */
6314 for( d = 0; d < 16; d++ ) {
6315 const int p0 = pix[-1*pix_next];
6316 const int p1 = pix[-2*pix_next];
6317 const int p2 = pix[-3*pix_next];
6318 const int q0 = pix[0];
6319 const int q1 = pix[1*pix_next];
6320 const int q2 = pix[2*pix_next];
6322 if( FFABS( p0 - q0 ) < alpha &&
6323 FFABS( p1 - p0 ) < beta &&
6324 FFABS( q1 - q0 ) < beta ) {
6326 const int p3 = pix[-4*pix_next];
6327 const int q3 = pix[ 3*pix_next];
6329 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6330 if( FFABS( p2 - p0 ) < beta) {
/* strong filtering of the p side (3 samples) */
6332 pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6333 pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6334 pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6337 pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6339 if( FFABS( q2 - q0 ) < beta) {
/* strong filtering of the q side (3 samples) */
6341 pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6342 pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6343 pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6346 pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
/* weak fallback: only p0/q0 are modified */
6350 pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6351 pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6353 tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
/**
 * Deblocks one horizontal chroma edge; both the clipped (bS < 4) and
 * intra-strength filters are delegated to the DSP functions.
 * NOTE(review): partial listing — the tc[] declaration, loop header
 * and branch structure are elided.
 */
6360 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6362 const int index_a = qp + h->slice_alpha_c0_offset;
6363 const int alpha = (alpha_table+52)[index_a];
6364 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
/* chroma uses tc0+1 (and 0 as the "skip" marker) per the chroma filter DSP contract */
6369 tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6370 h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6372 h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
/* Fast-path deblocking for one macroblock (non-MBAFF only, see assert).
 * Falls back to the general filter_mb() for border macroblocks and other
 * cases the fast path cannot handle; otherwise computes edge strengths
 * (bS) directly and calls the per-edge filters.
 * NOTE(review): many interior lines are missing from this excerpt (gaps
 * in the embedded numbering); braces and some branches are not visible. */
static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
    MpegEncContext * const s = &h->s;
    /* in a bottom-field picture the first macroblock row is row 1 */
    int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
    int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
    mb_xy = mb_x + mb_y*s->mb_stride;
    /* fall back to the full filter at picture borders, when no optimized
     * strength function exists, when cb/cr use different QP offsets, or
     * when deblocking==2 would cross a slice boundary */
    if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
       (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
                                      h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
        filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
    assert(!FRAME_MBAFF);
    mb_type = s->current_picture.mb_type[mb_xy];
    /* edge QPs are the average of the two adjacent macroblocks' QPs */
    qp = s->current_picture.qscale_table[mb_xy];
    qp0 = s->current_picture.qscale_table[mb_xy-1];
    qp1 = s->current_picture.qscale_table[h->top_mb_xy];
    qpc = get_chroma_qp( h, 0, qp );
    qpc0 = get_chroma_qp( h, 0, qp0 );
    qpc1 = get_chroma_qp( h, 0, qp1 );
    qp0 = (qp + qp0 + 1) >> 1;
    qp1 = (qp + qp1 + 1) >> 1;
    qpc0 = (qpc + qpc0 + 1) >> 1;
    qpc1 = (qpc + qpc1 + 1) >> 1;
    /* below this threshold the filter provably changes nothing, so skip */
    qp_thresh = 15 - h->slice_alpha_c0_offset;
    if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
       qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
    if( IS_INTRA(mb_type) ) {
        /* intra MBs use fixed strengths: 4 on MB borders (3 in fields), 3 inside */
        int16_t bS4[4] = {4,4,4,4};
        int16_t bS3[4] = {3,3,3,3};
        int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
        if( IS_8x8DCT(mb_type) ) {
            /* with 8x8 transform only every other luma edge is filtered */
            filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
            filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
            filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
            filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
            filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
            filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
            filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
            filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
            filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
            filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
            filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
            filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
        /* chroma: only edges 0 and 2 exist at half resolution */
        filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
        filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
        filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
        filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
        filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
        filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
        filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
        filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
        /* inter MB: pack four 16-bit strengths per edge into one uint64 */
        DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
        uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
        if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
            bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
            /* mask_edge: how often to recheck mv-based bS between edges */
            int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
                             (mb_type & MB_TYPE_16x8) ? 1 : 0;
            int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
                             && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
            int step = IS_8x8DCT(mb_type) ? 2 : 1;
            edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
            s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
                                              (h->slice_type == FF_B_TYPE), edges, step, mask_edge0, mask_edge1 );
        /* intra neighbours force maximum strength on the shared edge */
        if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
            bSv[0][0] = 0x0004000400040004ULL;
        if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
            bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
/* apply one luma + (for even edges) chroma filter pass; dir 0 = vertical */
#define FILTER(hv,dir,edge)\
        if(bSv[dir][edge]) {\
            filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
                filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
                filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
        } else if( IS_8x8DCT(mb_type) ) {
/* General (slow-path) deblocking of one macroblock, handling MBAFF,
 * field pictures and slice-boundary rules that filter_mb_fast() cannot.
 * Computes boundary strength bS per 4-pixel edge segment from intra
 * status, coded coefficients, reference frames and motion-vector deltas,
 * then calls the per-edge luma/chroma filters.
 * NOTE(review): many interior lines are missing from this excerpt (gaps
 * in the embedded numbering); braces and some branches are not visible. */
static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
    MpegEncContext * const s = &h->s;
    const int mb_xy= mb_x + mb_y*s->mb_stride;
    const int mb_type = s->current_picture.mb_type[mb_xy];
    /* vertical mv threshold is halved for interlaced blocks */
    const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
    int first_vertical_edge_done = 0;
    /* FIXME: A given frame may occupy more than one position in
     * the reference list. So ref2frm should be populated with
     * frame numbers, not indices. */
    static const int ref2frm[34] = {-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
                                    16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
    //for sufficiently low qp, filtering wouldn't do anything
    //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
    int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
    int qp = s->current_picture.qscale_table[mb_xy];
    && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
    && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
    // left mb is in picture
    && h->slice_table[mb_xy-1] != 255
    // and current and left pair do not have the same interlaced type
    && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
    // and left mb is in the same slice if deblocking_filter == 2
    && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
        /* First vertical edge is different in MBAFF frames
         * There are 8 different bS to compute and 2 different Qp
         */
        const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
        const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
        int mb_qp, mbn0_qp, mbn1_qp;
        first_vertical_edge_done = 1;
        if( IS_INTRA(mb_type) )
            bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
            for( i = 0; i < 8; i++ ) {
                /* field MBs pair segments vertically, frame MBs alternate */
                int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
                if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
                else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
                         /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
                         h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
        /* two QP sets: one per left-neighbour field of the MB pair */
        mb_qp = s->current_picture.qscale_table[mb_xy];
        mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
        mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
        qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
        bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
                   get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
        rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
                   get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
        qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
        bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
                   get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
        rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
                   get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
        tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
        { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
        filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
        filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
        filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
    /* dir : 0 -> vertical edge, 1 -> horizontal edge */
    for( dir = 0; dir < 2; dir++ )
        const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
        const int mbm_type = s->current_picture.mb_type[mbm_xy];
        /* 255 marks "no slice" -> neighbour outside picture, skip edge 0 */
        int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
        const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
                          == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
        // how often to recheck mv-based bS when iterating between edges
        const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
                              (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
        // how often to recheck mv-based bS when iterating along each edge
        const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
        if (first_vertical_edge_done) {
            first_vertical_edge_done = 0;
        if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
        if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
            && !IS_INTERLACED(mb_type)
            && IS_INTERLACED(mbm_type)
            // This is a special case in the norm where the filtering must
            // be done twice (one each of the field) even if we are in a
            // frame macroblock.
            static const int nnz_idx[4] = {4,5,6,3};
            unsigned int tmp_linesize   = 2 *   linesize;
            unsigned int tmp_uvlinesize = 2 * uvlinesize;
            int mbn_xy = mb_xy - 2 * s->mb_stride;
            for(j=0; j<2; j++, mbn_xy += s->mb_stride){
                if( IS_INTRA(mb_type) ||
                    IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
                    bS[0] = bS[1] = bS[2] = bS[3] = 3;
                    const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
                    for( i = 0; i < 4; i++ ) {
                        if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
                            mbn_nnz[nnz_idx[i]] != 0 )
                // Do not use s->qscale as luma quantizer because it has not the same
                // value in IPCM macroblocks.
                qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
                tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
                { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
                filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
                filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
                                  ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
                filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
                                  ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
        for( edge = start; edge < edges; edge++ ) {
            /* mbn_xy: neighbor macroblock */
            const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
            const int mbn_type = s->current_picture.mb_type[mbn_xy];
            /* with 8x8 transform, odd inner edges carry no block boundary */
            if( (edge&1) && IS_8x8DCT(mb_type) )
            if( IS_INTRA(mb_type) ||
                IS_INTRA(mbn_type) ) {
                if ( (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
                     || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
                bS[0] = bS[1] = bS[2] = bS[3] = value;
                if( edge & mask_edge ) {
                    bS[0] = bS[1] = bS[2] = bS[3] = 0;
                else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
                    bS[0] = bS[1] = bS[2] = bS[3] = 1;
                else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
                    int b_idx= 8 + 4 + edge * (dir ? 8:1);
                    int bn_idx= b_idx - (dir ? 8:1);
                    for( l = 0; !v && l < 1 + (h->slice_type == FF_B_TYPE); l++ ) {
                        /* bS=1 if refs differ or any mv component differs by >= 1 px */
                        v |= ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
                    bS[0] = bS[1] = bS[2] = bS[3] = v;
                    for( i = 0; i < 4; i++ ) {
                        int x = dir == 0 ? edge : i;
                        int y = dir == 0 ? i    : edge;
                        int b_idx= 8 + 4 + x + 8*y;
                        int bn_idx= b_idx - (dir ? 8:1);
                        if( h->non_zero_count_cache[b_idx] != 0 ||
                            h->non_zero_count_cache[bn_idx] != 0 ) {
                            for( l = 0; l < 1 + (h->slice_type == FF_B_TYPE); l++ ) {
                                if( ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
                                    FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
                                    FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
            if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
            // Do not use s->qscale as luma quantizer because it has not the same
            // value in IPCM macroblocks.
            qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
            //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
            tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
            { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
                filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
                if( (edge&1) == 0 ) {
                    filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
                                      ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
                    filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
                                      ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
                filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
                if( (edge&1) == 0 ) {
                    filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
                                      ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
                    filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
                                      ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6750 static int decode_slice(struct AVCodecContext *avctx, H264Context *h){
6751 MpegEncContext * const s = &h->s;
6752 const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6756 if( h->pps.cabac ) {
6760 align_get_bits( &s->gb );
6763 ff_init_cabac_states( &h->cabac);
6764 ff_init_cabac_decoder( &h->cabac,
6765 s->gb.buffer + get_bits_count(&s->gb)/8,
6766 ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6767 /* calculate pre-state */
6768 for( i= 0; i < 460; i++ ) {
6770 if( h->slice_type == FF_I_TYPE )
6771 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6773 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6776 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6778 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6783 int ret = decode_mb_cabac(h);
6785 //STOP_TIMER("decode_mb_cabac")
6787 if(ret>=0) hl_decode_mb(h);
6789 if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6792 if(ret>=0) ret = decode_mb_cabac(h);
6794 if(ret>=0) hl_decode_mb(h);
6797 eos = get_cabac_terminate( &h->cabac );
6799 if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6800 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6801 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6805 if( ++s->mb_x >= s->mb_width ) {
6807 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6809 if(FIELD_OR_MBAFF_PICTURE) {
6814 if( eos || s->mb_y >= s->mb_height ) {
6815 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6816 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6823 int ret = decode_mb_cavlc(h);
6825 if(ret>=0) hl_decode_mb(h);
6827 if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6829 ret = decode_mb_cavlc(h);
6831 if(ret>=0) hl_decode_mb(h);
6836 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6837 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6842 if(++s->mb_x >= s->mb_width){
6844 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6846 if(FIELD_OR_MBAFF_PICTURE) {
6849 if(s->mb_y >= s->mb_height){
6850 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6852 if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6853 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6857 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6864 if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6865 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6866 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6867 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6871 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6880 for(;s->mb_y < s->mb_height; s->mb_y++){
6881 for(;s->mb_x < s->mb_width; s->mb_x++){
6882 int ret= decode_mb(h);
6887 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6888 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6893 if(++s->mb_x >= s->mb_width){
6895 if(++s->mb_y >= s->mb_height){
6896 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6897 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6901 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6908 if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6909 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6910 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6914 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6921 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6924 return -1; //not reached
/* Parse an SEI "user data unregistered" payload: copy up to 16+256 bytes,
 * detect an x264 encoder version string (stored in h->x264_build for
 * bug workarounds), optionally log the text, and skip any remainder.
 * NOTE(review): some interior lines are missing from this excerpt (gaps
 * in the embedded numbering). */
static int decode_unregistered_user_data(H264Context *h, int size){
    MpegEncContext * const s = &h->s;
    /* first 16 bytes are the UUID, text payload follows */
    uint8_t user_data[16+256];
    for(i=0; i<sizeof(user_data)-1 && i<size; i++){
        user_data[i]= get_bits(&s->gb, 8);
    e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
    if(e==1 && build>=0)
        h->x264_build= build;
    if(s->avctx->debug & FF_DEBUG_BUGS)
        av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
    /* consume bytes of the payload not captured above */
    skip_bits(&s->gb, 8);
/* Parse SEI NAL: iterate payloads while bits remain. Payload type and
 * size each use the 0xFF-extension coding (each 255 byte adds 255 and
 * continues). Unregistered user data is decoded; others are skipped.
 * NOTE(review): some interior lines are missing from this excerpt (gaps
 * in the embedded numbering). */
static int decode_sei(H264Context *h){
    MpegEncContext * const s = &h->s;
    while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
            type+= show_bits(&s->gb, 8);
        }while(get_bits(&s->gb, 8) == 255);
            size+= show_bits(&s->gb, 8);
        }while(get_bits(&s->gb, 8) == 255);
            if(decode_unregistered_user_data(h, size) < 0)
            /* unknown payload type: skip it wholesale */
            skip_bits(&s->gb, 8*size);
    //FIXME check bits here
    align_get_bits(&s->gb);
/* Parse (and discard) the hypothetical reference decoder (HRD) parameters
 * from the VUI. Nothing is stored; the bitstream position is simply
 * advanced past cpb_count coded-picture-buffer entries and the delay
 * length fields.
 * NOTE(review): a few lines are missing from this excerpt (gaps in the
 * embedded numbering). */
static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
    MpegEncContext * const s = &h->s;
    cpb_count = get_ue_golomb(&s->gb) + 1;
    get_bits(&s->gb, 4); /* bit_rate_scale */
    get_bits(&s->gb, 4); /* cpb_size_scale */
    for(i=0; i<cpb_count; i++){
        get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
        get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
        get_bits1(&s->gb);     /* cbr_flag */
    get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
    get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
    get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
    get_bits(&s->gb, 5); /* time_offset_length */
/* Parse the SPS VUI parameters: sample aspect ratio, (mostly discarded)
 * video signal / chroma location info, timing info, optional HRD
 * parameters and the bitstream restriction block (num_reorder_frames is
 * validated and stored).
 * NOTE(review): some interior lines are missing from this excerpt (gaps
 * in the embedded numbering). */
static inline int decode_vui_parameters(H264Context *h, SPS *sps){
    MpegEncContext * const s = &h->s;
    int aspect_ratio_info_present_flag;
    unsigned int aspect_ratio_idc;
    int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
    aspect_ratio_info_present_flag= get_bits1(&s->gb);
    if( aspect_ratio_info_present_flag ) {
        aspect_ratio_idc= get_bits(&s->gb, 8);
        if( aspect_ratio_idc == EXTENDED_SAR ) {
            /* explicit 16-bit num/den pair */
            sps->sar.num= get_bits(&s->gb, 16);
            sps->sar.den= get_bits(&s->gb, 16);
        }else if(aspect_ratio_idc < sizeof(pixel_aspect)/sizeof(*pixel_aspect)){
            /* table-defined SAR */
            sps->sar= pixel_aspect[aspect_ratio_idc];
            av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
//            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
    if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
        get_bits1(&s->gb);      /* overscan_appropriate_flag */
    if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
        get_bits(&s->gb, 3);    /* video_format */
        get_bits1(&s->gb);      /* video_full_range_flag */
        if(get_bits1(&s->gb)){  /* colour_description_present_flag */
            get_bits(&s->gb, 8); /* colour_primaries */
            get_bits(&s->gb, 8); /* transfer_characteristics */
            get_bits(&s->gb, 8); /* matrix_coefficients */
    if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
        get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
        get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
    sps->timing_info_present_flag = get_bits1(&s->gb);
    if(sps->timing_info_present_flag){
        sps->num_units_in_tick = get_bits_long(&s->gb, 32);
        sps->time_scale = get_bits_long(&s->gb, 32);
        sps->fixed_frame_rate_flag = get_bits1(&s->gb);
    nal_hrd_parameters_present_flag = get_bits1(&s->gb);
    if(nal_hrd_parameters_present_flag)
        decode_hrd_parameters(h, sps);
    vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
    if(vcl_hrd_parameters_present_flag)
        decode_hrd_parameters(h, sps);
    if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
        get_bits1(&s->gb);     /* low_delay_hrd_flag */
    get_bits1(&s->gb);         /* pic_struct_present_flag */
    sps->bitstream_restriction_flag = get_bits1(&s->gb);
    if(sps->bitstream_restriction_flag){
        unsigned int num_reorder_frames;
        get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
        get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
        get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
        get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
        get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
        num_reorder_frames= get_ue_golomb(&s->gb);
        get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
        /* bound check before storing — golomb can return huge values */
        if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
            av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", num_reorder_frames);
        sps->num_reorder_frames= num_reorder_frames;
/* Parse one scaling list (16 or 64 entries, zigzag order).
 * If the "present" bit is absent, copy the fallback list; if the first
 * delta yields 0, use the JVT default list; otherwise delta-decode with
 * "repeat last value" semantics when next becomes 0.
 * NOTE(review): the else/break lines are missing from this excerpt (gaps
 * in the embedded numbering). */
static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
                                const uint8_t *jvt_list, const uint8_t *fallback_list){
    MpegEncContext * const s = &h->s;
    int i, last = 8, next = 8;
    const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
    if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
        memcpy(factors, fallback_list, size*sizeof(uint8_t));
        for(i=0;i<size;i++){
            /* deltas are signed golomb, values wrap mod 256 */
            next = (last + get_se_golomb(&s->gb)) & 0xff;
            if(!i && !next){ /* matrix not written, we use the preset one */
                memcpy(factors, jvt_list, size*sizeof(uint8_t));
            last = factors[scan[i]] = next ? next : last;
/* Parse all scaling matrices of an SPS or PPS.
 * Fallback chain per the standard: a PPS falls back to the SPS matrices
 * when the SPS had them, otherwise to the flat/JVT defaults; within a
 * parameter set, each chroma list falls back to the previous list.
 * NOTE(review): some lines (array terminator, closing braces) are missing
 * from this excerpt (gaps in the embedded numbering). */
static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
    MpegEncContext * const s = &h->s;
    int fallback_sps = !is_sps && sps->scaling_matrix_present;
    const uint8_t *fallback[4] = {
        fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
        fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
        fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
        fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
    if(get_bits1(&s->gb)){
        sps->scaling_matrix_present |= is_sps;
        decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]);        // Intra, Y
        decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
        decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
        decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]);        // Inter, Y
        decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
        decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
        if(is_sps || pps->transform_8x8_mode){
            /* 8x8 lists only exist for SPS or when the PPS enables 8x8 DCT */
            decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
            decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
    } else if(fallback_sps) {
        /* no PPS matrices present: inherit the SPS matrices wholesale */
        memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
        memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
/*
 * Returns and optionally allocates SPS / PPS structures in the supplied array 'vec'
 * Validates that id < max; the entry is lazily zero-allocated on first use.
 * NOTE(review): several lines (return statements, braces) are missing from
 * this excerpt (gaps in the embedded numbering).
 */
alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
                    const size_t size, const char *name)
        av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%d) out of range\n", name, id);
        vec[id] = av_mallocz(size);
            av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
7152 static inline int decode_seq_parameter_set(H264Context *h){
7153 MpegEncContext * const s = &h->s;
7154 int profile_idc, level_idc;
7155 unsigned int sps_id, tmp, mb_width, mb_height;
7159 profile_idc= get_bits(&s->gb, 8);
7160 get_bits1(&s->gb); //constraint_set0_flag
7161 get_bits1(&s->gb); //constraint_set1_flag
7162 get_bits1(&s->gb); //constraint_set2_flag
7163 get_bits1(&s->gb); //constraint_set3_flag
7164 get_bits(&s->gb, 4); // reserved
7165 level_idc= get_bits(&s->gb, 8);
7166 sps_id= get_ue_golomb(&s->gb);
7168 sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
7172 sps->profile_idc= profile_idc;
7173 sps->level_idc= level_idc;
7175 if(sps->profile_idc >= 100){ //high profile
7176 if(get_ue_golomb(&s->gb) == 3) //chroma_format_idc
7177 get_bits1(&s->gb); //residual_color_transform_flag
7178 get_ue_golomb(&s->gb); //bit_depth_luma_minus8
7179 get_ue_golomb(&s->gb); //bit_depth_chroma_minus8
7180 sps->transform_bypass = get_bits1(&s->gb);
7181 decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7183 sps->scaling_matrix_present = 0;
7185 sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7186 sps->poc_type= get_ue_golomb(&s->gb);
7188 if(sps->poc_type == 0){ //FIXME #define
7189 sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7190 } else if(sps->poc_type == 1){//FIXME #define
7191 sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7192 sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7193 sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7194 tmp= get_ue_golomb(&s->gb);
7196 if(tmp >= sizeof(sps->offset_for_ref_frame) / sizeof(sps->offset_for_ref_frame[0])){
7197 av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
7200 sps->poc_cycle_length= tmp;
7202 for(i=0; i<sps->poc_cycle_length; i++)
7203 sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7204 }else if(sps->poc_type != 2){
7205 av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7209 tmp= get_ue_golomb(&s->gb);
7210 if(tmp > MAX_PICTURE_COUNT-2 || tmp >= 32){
7211 av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7214 sps->ref_frame_count= tmp;
7215 sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7216 mb_width= get_ue_golomb(&s->gb) + 1;
7217 mb_height= get_ue_golomb(&s->gb) + 1;
7218 if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
7219 avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
7220 av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7223 sps->mb_width = mb_width;
7224 sps->mb_height= mb_height;
7226 sps->frame_mbs_only_flag= get_bits1(&s->gb);
7227 if(!sps->frame_mbs_only_flag)
7228 sps->mb_aff= get_bits1(&s->gb);
7232 sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7234 #ifndef ALLOW_INTERLACE
7236 av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7238 if(!sps->direct_8x8_inference_flag && sps->mb_aff)
7239 av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
7241 sps->crop= get_bits1(&s->gb);
7243 sps->crop_left = get_ue_golomb(&s->gb);
7244 sps->crop_right = get_ue_golomb(&s->gb);
7245 sps->crop_top = get_ue_golomb(&s->gb);
7246 sps->crop_bottom= get_ue_golomb(&s->gb);
7247 if(sps->crop_left || sps->crop_top){
7248 av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7250 if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !h->sps.frame_mbs_only_flag)){
7251 av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7257 sps->crop_bottom= 0;
7260 sps->vui_parameters_present_flag= get_bits1(&s->gb);
7261 if( sps->vui_parameters_present_flag )
7262 decode_vui_parameters(h, sps);
7264 if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7265 av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
7266 sps_id, sps->profile_idc, sps->level_idc,
7268 sps->ref_frame_count,
7269 sps->mb_width, sps->mb_height,
7270 sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7271 sps->direct_8x8_inference_flag ? "8B8" : "",
7272 sps->crop_left, sps->crop_right,
7273 sps->crop_top, sps->crop_bottom,
7274 sps->vui_parameters_present_flag ? "VUI" : ""
/* Fill pps->chroma_qp_table[t]: maps each luma QP (plus the chroma QP
 * index offset, clipped to [0,51]) to the chroma QP via chroma_qp[].
 * NOTE(review): the loop runs i < 255, so table entry 255 is never
 * written — looks off-by-one versus the 256-entry table; confirm against
 * how negative/wrapped QP indices reach this table.
 * NOTE(review): the opening brace / declaration line is missing from this
 * excerpt (gap in the embedded numbering). */
build_qp_table(PPS *pps, int t, int index)
    for(i = 0; i < 255; i++)
        pps->chroma_qp_table[t][i & 0xff] = chroma_qp[av_clip(i + index, 0, 51)];
/* Parse a picture parameter set NAL and store it in h->pps_buffers.
 * Reads entropy mode, slice groups (FMO unsupported — only logged),
 * reference counts, weighting, QP offsets, deblocking/intra-pred flags,
 * then the optional trailing 8x8-transform + scaling-matrix + second
 * chroma offset fields, and finally builds the chroma QP lookup tables.
 * NOTE(review): many interior lines are missing from this excerpt (gaps
 * in the embedded numbering); the block comment around the FMO syntax
 * table and several braces are not visible. */
static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
    MpegEncContext * const s = &h->s;
    unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
    pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
    tmp= get_ue_golomb(&s->gb);
    /* referenced SPS must exist before this PPS can be interpreted */
    if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
        av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
    pps->cabac= get_bits1(&s->gb);
    pps->pic_order_present= get_bits1(&s->gb);
    pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
    if(pps->slice_group_count > 1 ){
        pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
        av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
        switch(pps->mb_slice_group_map_type){
|   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |      |
|    run_length[ i ]                                |1  |ue(v) |
|   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |      |
|    top_left_mb[ i ]                               |1  |ue(v) |
|    bottom_right_mb[ i ]                           |1  |ue(v) |
|   slice_group_change_direction_flag               |1  |u(1)  |
|   slice_group_change_rate_minus1                  |1  |ue(v) |
|   slice_group_id_cnt_minus1                       |1  |ue(v) |
|   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |      |
|    slice_group_id[ i ]                            |1  |u(v)  |
    pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
    pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
    if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
        av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
        pps->ref_count[0]= pps->ref_count[1]= 1;
    pps->weighted_pred= get_bits1(&s->gb);
    pps->weighted_bipred_idc= get_bits(&s->gb, 2);
    pps->init_qp= get_se_golomb(&s->gb) + 26;
    pps->init_qs= get_se_golomb(&s->gb) + 26;
    pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
    pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
    pps->constrained_intra_pred= get_bits1(&s->gb);
    pps->redundant_pic_cnt_present = get_bits1(&s->gb);
    pps->transform_8x8_mode= 0;
    h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
    /* flat (all-16) matrices as default until parsed otherwise */
    memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
    memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
    /* optional trailing fields: only present if bits remain in the NAL */
    if(get_bits_count(&s->gb) < bit_length){
        pps->transform_8x8_mode= get_bits1(&s->gb);
        decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
        pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
        pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
    build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
    if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1]) {
        build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
        h->pps.chroma_qp_diff= 1;
        /* same offsets -> share table 0 for cb and cr */
        memcpy(pps->chroma_qp_table[1], pps->chroma_qp_table[0], 256);
    if(s->avctx->debug&FF_DEBUG_PICT_INFO){
        av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
               pps_id, pps->sps_id,
               pps->cabac ? "CABAC" : "CAVLC",
               pps->slice_group_count,
               pps->ref_count[0], pps->ref_count[1],
               pps->weighted_pred ? "weighted" : "",
               pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
               pps->deblocking_filter_parameters_present ? "LPAR" : "",
               pps->constrained_intra_pred ? "CONSTR" : "",
               pps->redundant_pic_cnt_present ? "REDU" : "",
               pps->transform_8x8_mode ? "8x8DCT" : ""
/**
7400  * Call decode_slice() for each context.
 *
7402  * @param h h264 master context
7403  * @param context_count number of contexts to execute
 */
/* NOTE(review): lines are missing from this extract (non-contiguous original
 * numbering) — the single-context `else` branch structure and closing braces
 * are not fully visible; code left byte-identical. */
7405 static void execute_decode_slices(H264Context *h, int context_count){
7406     MpegEncContext * const s = &h->s;
7407     AVCodecContext * const avctx= s->avctx;
/* Fast path: one context decodes directly on the calling thread. */
7411     if(context_count == 1) {
7412         decode_slice(avctx, h);
/* Multi-context path: propagate resilience settings to each worker context
 * and reset its per-slice error count before dispatch. */
7414         for(i = 1; i < context_count; i++) {
7415             hx = h->thread_context[i];
7416             hx->s.error_resilience = avctx->error_resilience;
7417             hx->s.error_count = 0;
/* Run decode_slice over all thread contexts via the codec's executor. */
7420         avctx->execute(avctx, (void *)decode_slice,
7421                        (void **)h->thread_context, NULL, context_count);
7423         /* pull back stuff from slices to master context */
7424         hx = h->thread_context[context_count - 1];
7425         s->mb_x = hx->s.mb_x;
7426         s->mb_y = hx->s.mb_y;
7427         s->dropable = hx->s.dropable;
7428         s->picture_structure = hx->s.picture_structure;
/* Accumulate per-context error counts into the master context. */
7429         for(i = 1; i < context_count; i++)
7430             h->s.error_count += h->thread_context[i]->s.error_count;
/* Split the input buffer into NAL units (AVC length-prefixed or Annex-B
 * start-code delimited), unescape each via decode_nal(), and dispatch on
 * nal_unit_type (slices, DPA/DPB/DPC partitions, SEI, SPS, PPS, ...).
 * Slices are queued across up to h->max_contexts thread contexts and flushed
 * through execute_decode_slices().
 * NOTE(review): many lines are missing from this extract (non-contiguous
 * original numbering) — the main `for(;;)` loop header, several case labels,
 * error-path gotos and the final return are not visible; code left
 * byte-identical. */
7435 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7436     MpegEncContext * const s = &h->s;
7437     AVCodecContext * const avctx= s->avctx;
7439     H264Context *hx; ///< thread context
7440     int context_count = 0;
7442     h->max_contexts = avctx->thread_count;
/* Debug hexdump of the first bytes of the buffer (normally compiled out —
 * presumably under a #if 0 not visible in this extract). */
7445     for(i=0; i<50; i++){
7446         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
/* Unless decoding in chunks, a new call means a new access unit: reset the
 * slice counter and (if not waiting for a second field) the current picture. */
7449     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7450         h->current_slice = 0;
7451         if (!s->first_field)
7452             s->current_picture_ptr= NULL;
7464         if(buf_index >= buf_size) break;
/* AVC mode: read the big-endian NAL length prefix (nal_length_size bytes). */
7466         for(i = 0; i < h->nal_length_size; i++)
7467             nalsize = (nalsize << 8) | buf[buf_index++];
7468         if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7473             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7478         // start code prefix search
7479         for(; buf_index + 3 < buf_size; buf_index++){
7480             // This should always succeed in the first iteration.
7481             if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7485         if(buf_index+3 >= buf_size) break;
/* Decode this NAL into the next free thread context's rbsp buffer. */
7490         hx = h->thread_context[context_count];
7492         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7493         if (ptr==NULL || dst_length < 0){
/* Strip trailing zero bytes, then subtract the rbsp_stop_one_bit padding to
 * get the exact payload bit length.
 * NOTE(review): the condition order reads ptr[dst_length-1] before checking
 * dst_length > 0 — out-of-bounds read when dst_length==0; the operands look
 * swapped, confirm against upstream. */
7496         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7498         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7500         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7501             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7504         if (h->is_avc && (nalsize != consumed)){
7505             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7509         buf_index += consumed;
/* Skip non-reference NALs entirely when hurrying or discarding non-refs. */
7511         if( (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7512            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7517         switch(hx->nal_unit_type){
/* (case NAL_IDR_SLICE, per the check below) */
7519             if (h->nal_unit_type != NAL_IDR_SLICE) {
7520                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7523             idr(h); //FIXME ensure we don't loose some frames if there is reordering
/* Regular (non-partitioned) slice: all data comes from one bitstream. */
7525             init_get_bits(&hx->s.gb, ptr, bit_length);
7527             hx->inter_gb_ptr= &hx->s.gb;
7528             hx->s.data_partitioning = 0;
7530             if((err = decode_slice_header(hx, h)))
7533             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
/* Queue the slice for decoding unless skip settings say to drop it. */
7534             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7535                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7536                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type!=FF_B_TYPE)
7537                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type==FF_I_TYPE)
7538                && avctx->skip_frame < AVDISCARD_ALL)
/* Data-partitioned slice, partition A: slice header + MB data. */
7542             init_get_bits(&hx->s.gb, ptr, bit_length);
7544             hx->inter_gb_ptr= NULL;
7545             hx->s.data_partitioning = 1;
7547             err = decode_slice_header(hx, h);
/* Partition B: intra residual bitstream. */
7550             init_get_bits(&hx->intra_gb, ptr, bit_length);
7551             hx->intra_gb_ptr= &hx->intra_gb;
/* Partition C: inter residual bitstream; with A+B present the slice can run. */
7554             init_get_bits(&hx->inter_gb, ptr, bit_length);
7555             hx->inter_gb_ptr= &hx->inter_gb;
7557             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7558                && s->context_initialized
7560                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7561                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type!=FF_B_TYPE)
7562                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type==FF_I_TYPE)
7563                && avctx->skip_frame < AVDISCARD_ALL)
/* (case NAL_SEI, presumably — the label itself is not visible here) */
7567             init_get_bits(&s->gb, ptr, bit_length);
/* (case NAL_SPS) */
7571             init_get_bits(&s->gb, ptr, bit_length);
7572             decode_seq_parameter_set(h);
7574             if(s->flags& CODEC_FLAG_LOW_DELAY)
7577             if(avctx->has_b_frames < 2)
7578                 avctx->has_b_frames= !s->low_delay;
/* (case NAL_PPS) */
7581             init_get_bits(&s->gb, ptr, bit_length);
7583             decode_picture_parameter_set(h, bit_length);
7587         case NAL_END_SEQUENCE:
7588         case NAL_END_STREAM:
7589         case NAL_FILLER_DATA:
7591         case NAL_AUXILIARY_SLICE:
7594             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
/* Flush queued slices once all contexts are occupied. */
7597         if(context_count == h->max_contexts) {
7598             execute_decode_slices(h, context_count);
7603             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7605             /* Slice could not be decoded in parallel mode, copy down
7606              * NAL unit stuff to context 0 and restart. Note that
7607              * rbsp_buffer is not transfered, but since we no longer
7608              * run in parallel mode this should not be an issue. */
7609             h->nal_unit_type = hx->nal_unit_type;
7610             h->nal_ref_idc   = hx->nal_ref_idc;
/* Flush any remaining queued slices before returning. */
7616         execute_decode_slices(h, context_count);
/**
7621  * returns the number of bytes consumed for building the current frame
 */
/* NOTE(review): lines are missing from this extract (non-contiguous original
 * numbering) — the return statement is not visible; code left byte-identical. */
7623 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
/* In truncated mode part of the buffer belongs to the next frame; subtract
 * what the parser carried over. */
7624     if(s->flags&CODEC_FLAG_TRUNCATED){
7625         pos -= s->parse_context.last_index;
7626         if(pos<0) pos=0; // FIXME remove (unneeded?)
7630         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7631         if(pos+10>buf_size) pos=buf_size; // oops ;)
/* Top-level AVCodec decode callback: handle flush/drain (buf_size == 0),
 * optional truncated-stream reassembly, one-time avcC extradata parsing,
 * then decode the NAL units and emit at most one reordered frame into *pict.
 * Returns the number of input bytes consumed.
 * NOTE(review): many lines are missing from this extract (non-contiguous
 * original numbering) — several declarations, braces, returns and the
 * field-picture handling are not visible; code left byte-identical. */
7637 static int decode_frame(AVCodecContext *avctx,
7638                              void *data, int *data_size,
7639                              const uint8_t *buf, int buf_size)
7641     H264Context *h = avctx->priv_data;
7642     MpegEncContext *s = &h->s;
7643     AVFrame *pict = data;
7646     s->flags= avctx->flags;
7647     s->flags2= avctx->flags2;
7649    /* no supplementary picture */
/* Drain path: on flush, output the lowest-poc delayed picture (up to the
 * next keyframe) and shift the delayed_pic queue down. */
7650     if (buf_size == 0) {
7654         //FIXME factorize this with the output code below
7655         out = h->delayed_pic[0];
7657         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7658             if(h->delayed_pic[i]->poc < out->poc){
7659                 out = h->delayed_pic[i];
7663         for(i=out_idx; h->delayed_pic[i]; i++)
7664             h->delayed_pic[i] = h->delayed_pic[i+1];
7667             *data_size = sizeof(AVFrame);
7668             *pict= *(AVFrame*)out;
/* Truncated-stream mode: find the frame boundary and accumulate input until
 * a full frame is available. */
7674     if(s->flags&CODEC_FLAG_TRUNCATED){
7675         int next= ff_h264_find_frame_end(h, buf, buf_size);
7677         if( ff_combine_frame(&s->parse_context, next, (const uint8_t **)&buf, &buf_size) < 0 )
7679 //printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
/* One-time parse of the avcC extradata (AVCDecoderConfigurationRecord):
 * version byte, then SPS and PPS arrays with 2-byte length prefixes. */
7682     if(h->is_avc && !h->got_avcC) {
7683         int i, cnt, nalsize;
7684         unsigned char *p = avctx->extradata;
7685         if(avctx->extradata_size < 7) {
7686             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7690             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7693         /* sps and pps in the avcC always have length coded with 2 bytes,
7694            so put a fake nal_length_size = 2 while parsing them */
7695         h->nal_length_size = 2;
7696         // Decode sps from avcC
7697         cnt = *(p+5) & 0x1f; // Number of sps
7699         for (i = 0; i < cnt; i++) {
7700             nalsize = AV_RB16(p) + 2;
7701             if(decode_nal_units(h, p, nalsize) < 0) {
7702                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7707         // Decode pps from avcC
7708         cnt = *(p++); // Number of pps
/* NOTE(review): the SPS loop above accepts any non-negative result while this
 * PPS loop requires `!= nalsize` — inconsistent success criteria; confirm
 * which is intended upstream. */
7709         for (i = 0; i < cnt; i++) {
7710             nalsize = AV_RB16(p) + 2;
7711             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7712                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7717         // Now store right nal length size, that will be use to parse all other nals
7718         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7719         // Do not reparse avcC
/* Annex-B extradata (non-avcC) is decoded once before the first frame. */
7723     if(avctx->frame_number==0 && !h->is_avc && s->avctx->extradata_size){
7724         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7728     buf_index=decode_nal_units(h, buf, buf_size);
7732     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7733         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7734         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
/* A picture is complete: finish it, run reference marking, then reorder. */
7738     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7739         Picture *out = s->current_picture_ptr;
7740         Picture *cur = s->current_picture_ptr;
7741         Picture *prev = h->delayed_output_pic;
7742         int i, pics, cross_idr, out_of_order, out_idx;
7746         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7747         s->current_picture_ptr->pict_type= s->pict_type;
/* Save POC/frame_num state for the next picture's POC computation. */
7749         h->prev_frame_num_offset= h->frame_num_offset;
7750         h->prev_frame_num= h->frame_num;
7752         h->prev_poc_msb= h->poc_msb;
7753         h->prev_poc_lsb= h->poc_lsb;
7754         execute_ref_pic_marking(h, h->mmco, h->mmco_index);
        /*
7758          * FIXME: Error handling code does not seem to support interlaced
7759          * when slices span multiple rows
7760          * The ff_er_add_slice calls don't work right for bottom
7761          * fields; they cause massive erroneous error concealing
7762          * Error marking covers both fields (top and bottom).
7763          * This causes a mismatched s->error_count
7764          * and a bad error table. Further, the error count goes to
7765          * INT_MAX when called for bottom field, because mb_y is
7766          * past end by one (callers fault) and resync_mb_y != 0
7767          * causes problems for the first MB line, too.
         */
7774         if (s->first_field) {
7775             /* Wait for second field. */
7779         cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7780         /* Derive top_field_first from field pocs. */
7781         cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7783     //FIXME do something with unavailable reference frames
7785 #if 0 //decode order
7786         *data_size = sizeof(AVFrame);
        /* Sort B-frames into display order */
7790         if(h->sps.bitstream_restriction_flag
7791            && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7792             s->avctx->has_b_frames = h->sps.num_reorder_frames;
/* Append the current picture to the delayed-output queue. */
7797         while(h->delayed_pic[pics]) pics++;
7799         assert(pics+1 < sizeof(h->delayed_pic) / sizeof(h->delayed_pic[0]));
7801         h->delayed_pic[pics++] = cur;
/* Pin non-reference pictures while they sit in the reorder queue. */
7802         if(cur->reference == 0)
7803             cur->reference = DELAYED_PIC_REF;
7806         for(i=0; h->delayed_pic[i]; i++)
7807             if(h->delayed_pic[i]->key_frame || h->delayed_pic[i]->poc==0)
/* Pick the lowest-poc delayed picture (up to the next keyframe) for output. */
7810         out = h->delayed_pic[0];
7812         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7813             if(h->delayed_pic[i]->poc < out->poc){
7814                 out = h->delayed_pic[i];
/* Heuristically grow has_b_frames when output-order violations show that the
 * stream reorders more than currently assumed. */
7818         out_of_order = !cross_idr && prev && out->poc < prev->poc;
7819         if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7821         else if(prev && pics <= s->avctx->has_b_frames)
7823         else if((out_of_order && pics-1 == s->avctx->has_b_frames && pics < 15)
7825            ((!cross_idr && prev && out->poc > prev->poc + 2)
7826             || cur->pict_type == FF_B_TYPE)))
7829             s->avctx->has_b_frames++;
7832         else if(out_of_order)
/* Emit the chosen picture and compact the queue. */
7835         if(out_of_order || pics > s->avctx->has_b_frames){
7836             for(i=out_idx; h->delayed_pic[i]; i++)
7837                 h->delayed_pic[i] = h->delayed_pic[i+1];
7843             *data_size = sizeof(AVFrame);
/* Unpin the previously output picture now that it leaves the queue. */
7844             if(prev && prev != out && prev->reference == DELAYED_PIC_REF)
7845                 prev->reference = 0;
7846             h->delayed_output_pic = out;
7850             *pict= *(AVFrame*)out;
7852             av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7856     assert(pict->data[0] || !*data_size);
7857     ff_print_debug_info(s, pict);
7858 //printf("out %d\n", (int)pict->data[0]);
7861     /* Return the Picture timestamp as the frame number */
7862     /* we subtract 1 because it is added on utils.c */
7863     avctx->frame_number = s->picture_number - 1;
7865     return get_consumed_bytes(s, buf_index, buf_size);
/* Fill h->mb_avail[] with neighbour-availability flags for the current
 * macroblock: [0]=top-left, [1]=top, [2]=top-right, [3]=left. A neighbour is
 * available only if it belongs to the same slice (slice_table match) and is
 * inside the picture. Entries [4] and [5] are constants (see FIXMEs).
 * NOTE(review): lines are missing from this extract (non-contiguous original
 * numbering) — the mb_y==0 guard for the top row is not visible here. */
7868 static inline void fill_mb_avail(H264Context *h){
7869     MpegEncContext * const s = &h->s;
7870     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
7873         h->mb_avail[0]= s->mb_x                  && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
7874         h->mb_avail[1]=                             h->slice_table[mb_xy - s->mb_stride    ] == h->slice_num;
7875         h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
7881     h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
7882     h->mb_avail[4]= 1; //FIXME move out
7883     h->mb_avail[5]= 0; //FIXME move out
/* Built-in self-test harness (presumably compiled under #ifdef TEST — the
 * guard and the function opening are not visible in this extract). Exercises:
 * unsigned/signed Exp-Golomb write+read round-trips, the 4x4 (I)DCT error
 * bound, the quantizer, and NAL escape/unescape round-trips.
 * NOTE(review): many lines are missing (non-contiguous original numbering);
 * code left byte-identical. */
7891 #define SIZE (COUNT*40)
7897 //    int int_temp[10000];
7899     AVCodecContext avctx;
7901     dsputil_init(&dsp, &avctx);
/* --- unsigned Exp-Golomb round-trip --- */
7903     init_put_bits(&pb, temp, SIZE);
7904     printf("testing unsigned exp golomb\n");
7905     for(i=0; i<COUNT; i++){
7907         set_ue_golomb(&pb, i);
7908         STOP_TIMER("set_ue_golomb");
7910     flush_put_bits(&pb);
7912     init_get_bits(&gb, temp, 8*SIZE);
7913     for(i=0; i<COUNT; i++){
7916         s= show_bits(&gb, 24);
7919         j= get_ue_golomb(&gb);
7921             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7924         STOP_TIMER("get_ue_golomb");
/* --- signed Exp-Golomb round-trip (values centred on zero) --- */
7928     init_put_bits(&pb, temp, SIZE);
7929     printf("testing signed exp golomb\n");
7930     for(i=0; i<COUNT; i++){
7932         set_se_golomb(&pb, i - COUNT/2);
7933         STOP_TIMER("set_se_golomb");
7935     flush_put_bits(&pb);
7937     init_get_bits(&gb, temp, 8*SIZE);
7938     for(i=0; i<COUNT; i++){
7941         s= show_bits(&gb, 24);
7944         j= get_se_golomb(&gb);
7945         if(j != i - COUNT/2){
7946             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7949         STOP_TIMER("get_se_golomb");
/* --- 4x4 forward/inverse DCT accuracy on random 8-bit data --- */
7953     printf("testing 4x4 (I)DCT\n");
7956         uint8_t src[16], ref[16];
7957         uint64_t error= 0, max_error=0;
7959         for(i=0; i<COUNT; i++){
7961 //            printf("%d %d %d\n", r1, r2, (r2-r1)*16);
7962             for(j=0; j<16; j++){
7963                 ref[j]= random()%255;
7964                 src[j]= random()%255;
7967             h264_diff_dct_c(block, src, ref, 4);
/* Scale coefficients to compensate the transform's non-uniform norms. */
7970             for(j=0; j<16; j++){
7971 //                printf("%d ", block[j]);
7972                 block[j]= block[j]*4;
7973                 if(j&1) block[j]= (block[j]*4 + 2)/5;
7974                 if(j&4) block[j]= (block[j]*4 + 2)/5;
7978             s->dsp.h264_idct_add(ref, block, 4);
7979 /*            for(j=0; j<16; j++){
7980                 printf("%d ", ref[j]);
/* Accumulate reconstruction error versus the original source block. */
7984             for(j=0; j<16; j++){
7985                 int diff= FFABS(src[j] - ref[j]);
7988                 max_error= FFMAX(max_error, diff);
7991         printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
/* --- quantizer sweep over all 52 QP values --- */
7992         printf("testing quantizer\n");
7993         for(qp=0; qp<52; qp++){
7995                 src1_block[i]= src2_block[i]= random()%255;
/* --- NAL escape/unescape round-trip on random bitstreams with forced zero
 * runs (to exercise emulation prevention) --- */
7998         printf("Testing NAL layer\n");
8000         uint8_t bitstream[COUNT];
8001         uint8_t nal[COUNT*2];
8003         memset(&h, 0, sizeof(H264Context));
8005         for(i=0; i<COUNT; i++){
8013             for(j=0; j<COUNT; j++){
8014                 bitstream[j]= (random() % 255) + 1;
8017             for(j=0; j<zeros; j++){
8018                 int pos= random() % COUNT;
8019                 while(bitstream[pos] == 0){
8028             nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
8030                 printf("encoding failed\n");
8034             out= decode_nal(&h, nal, &out_length, &consumed, nal_length);
8038             if(out_length != COUNT){
8039                 printf("incorrect length %d %d\n", out_length, COUNT);
8043             if(consumed != nal_length){
8044                 printf("incorrect consumed length %d %d\n", nal_length, consumed);
8048             if(memcmp(bitstream, out, COUNT)){
8049                 printf("mismatch\n");
8055     printf("Testing RBSP\n");
/* AVCodec close callback: free the per-context RBSP unescape buffers and the
 * decoder's allocated tables.
 * NOTE(review): lines are missing from this extract (non-contiguous original
 * numbering) — the MPV_common_end() call and return are not visible; code
 * left byte-identical. */
8063 static av_cold int decode_end(AVCodecContext *avctx)
8065     H264Context *h = avctx->priv_data;
8066     MpegEncContext *s = &h->s;
8068     av_freep(&h->rbsp_buffer[0]);
8069     av_freep(&h->rbsp_buffer[1]);
8070     free_tables(h); //FIXME cleanup init stuff perhaps
8073 //    memset(h, 0, sizeof(H264Context));
8079 AVCodec h264_decoder = {
8083 sizeof(H264Context),
8088 /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,