git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35 #include "rectangle.h"
  36
  37 #include "cabac.h"
  38 #ifdef ARCH_X86
  39 #include "i386/h264_i386.h"
  40 #endif
  41
  42 //#undef NDEBUG
  43 #include <assert.h>
  44
  45 /**
  46  * Value of Picture.reference when Picture is not a reference picture, but
  47  * is held for delayed output.
  48  */
  49 #define DELAYED_PIC_REF 4
  50
/* File-local VLC tables. They are only declared here; the code that builds
 * and reads them is not part of this section.
 * NOTE(review): judging by the names these are the coefficient-token,
 * total-zeros and run-before tables of the CAVLC residual decoder -- confirm
 * against the initialisation code. */
static VLC coeff_token_vlc[4];
static VLC chroma_dc_coeff_token_vlc;

static VLC total_zeros_vlc[15];
static VLC chroma_dc_total_zeros_vlc[3];

static VLC run_vlc[6];
static VLC run7_vlc;

/* Forward declarations; the definitions are not part of this section. */
static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  64
  65 static av_always_inline uint32_t pack16to32(int a, int b){
  66 #ifdef WORDS_BIGENDIAN
  67    return (b&0xFFFF) + (a<<16);
  68 #else
  69    return (a&0xFFFF) + (b<<16);
  70 #endif
  71 }
  72
  73 const uint8_t ff_rem6[52]={
  74 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
  75 };
  76
  77 const uint8_t ff_div6[52]={
  78 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
  79 };
  80
  81
/**
 * Fills the per-macroblock neighbour caches of h for the macroblock at
 * h->mb_xy.
 *
 * The function locates the top, top-left, top-right and left neighbours
 * (with extra rules when FRAME_MBAFF is set), stores the top/left positions
 * in h->top_mb_xy and h->left_mb_xy[], and then loads from those neighbours:
 *  - the intra sample availability masks and the border of
 *    intra4x4_pred_mode_cache (only if mb_type is intra),
 *  - the border of non_zero_count_cache,
 *  - top_cbp / left_cbp (only if h->pps.cabac),
 *  - the borders of mv_cache / ref_cache and, with CABAC, mvd_cache and
 *    direct_cache (only if mb_type is inter or direct),
 *  - neighbor_transform_size.
 *
 * @param h           decoder context; caches inside it are written
 * @param mb_type     type flags of the current macroblock
 * @param for_deblock nonzero when the caches are wanted for deblocking.
 *                    In that mode neighbours are accepted whenever their
 *                    slice_table entry is < 255 (instead of requiring the
 *                    current slice), the top-left/top-right types are forced
 *                    to 0, and the function returns immediately, before
 *                    writing anything, when FRAME_MBAFF is not set and either
 *                    h->slice_num == 1 or the top macroblock has the same
 *                    slice_table entry as the current one.
 */
static void fill_caches(H264Context *h, int mb_type, int for_deblock){
    MpegEncContext * const s = &h->s;
    const int mb_xy= h->mb_xy;
    int topleft_xy, top_xy, topright_xy, left_xy[2];
    int topleft_type, top_type, topright_type, left_type[2];
    /* left_block[0..3]: row indices inside the left neighbour(s) used for
     * intra pred modes, nnz, mvs and refs of cache rows 1..4;
     * left_block[4..7]: indices into the left neighbour's non_zero_count[]
     * used for cache column 0. */
    int left_block[8];
    int topleft_partition= -1;
    int i;

    top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);

    //FIXME deblocking could skip the intra and nnz parts.
    if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
        return;

    /* Wow, what a mess, why didn't they simplify the interlacing & intra
     * stuff, I can't imagine that these complex rules are worth it. */

    /* Defaults used when no MBAFF adjustment applies. */
    topleft_xy = top_xy - 1;
    topright_xy= top_xy + 1;
    left_xy[1] = left_xy[0] = mb_xy-1;
    left_block[0]= 0;
    left_block[1]= 1;
    left_block[2]= 2;
    left_block[3]= 3;
    left_block[4]= 7;
    left_block[5]= 10;
    left_block[6]= 8;
    left_block[7]= 11;
    if(FRAME_MBAFF){
        /* Positions are derived per macroblock pair; the *_frame_flag values
         * tell whether the corresponding pair is not interlaced. */
        const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
        const int top_pair_xy      = pair_xy     - s->mb_stride;
        const int topleft_pair_xy  = top_pair_xy - 1;
        const int topright_pair_xy = top_pair_xy + 1;
        const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
        const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
        const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
        const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
        const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
        const int bottom = (s->mb_y & 1);
        tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
        if (bottom
                ? !curr_mb_frame_flag // bottom macroblock
                : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
                ) {
            top_xy -= s->mb_stride;
        }
        if (bottom
                ? !curr_mb_frame_flag // bottom macroblock
                : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
                ) {
            topleft_xy -= s->mb_stride;
        } else if(bottom && curr_mb_frame_flag && !left_mb_frame_flag) {
            topleft_xy += s->mb_stride;
            // take topleft mv from the middle of the mb, as opposed to all other modes which use the bottom-right partition
            topleft_partition = 0;
        }
        if (bottom
                ? !curr_mb_frame_flag // bottom macroblock
                : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
                ) {
            topright_xy -= s->mb_stride;
        }
        /* Current and left pair differ in frame/field coding: both left
         * references start at the top macroblock of the left pair and the
         * row mapping is changed accordingly. */
        if (left_mb_frame_flag != curr_mb_frame_flag) {
            left_xy[1] = left_xy[0] = pair_xy - 1;
            if (curr_mb_frame_flag) {
                if (bottom) {
                    left_block[0]= 2;
                    left_block[1]= 2;
                    left_block[2]= 3;
                    left_block[3]= 3;
                    left_block[4]= 8;
                    left_block[5]= 11;
                    left_block[6]= 8;
                    left_block[7]= 11;
                } else {
                    left_block[0]= 0;
                    left_block[1]= 0;
                    left_block[2]= 1;
                    left_block[3]= 1;
                    left_block[4]= 7;
                    left_block[5]= 10;
                    left_block[6]= 7;
                    left_block[7]= 10;
                }
            } else {
                /* second left reference is the bottom macroblock of the left pair */
                left_xy[1] += s->mb_stride;
                //left_block[0]= 0;
                left_block[1]= 2;
                left_block[2]= 0;
                left_block[3]= 2;
                //left_block[4]= 7;
                left_block[5]= 10;
                left_block[6]= 7;
                left_block[7]= 10;
            }
        }
    }

    h->top_mb_xy = top_xy;
    h->left_mb_xy[0] = left_xy[0];
    h->left_mb_xy[1] = left_xy[1];
    if(for_deblock){
        /* Deblocking: a neighbour counts whenever its slice_table entry is
         * < 255; top-left and top-right are not used. */
        topleft_type = 0;
        topright_type = 0;
        top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
        left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
        left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;

        if(FRAME_MBAFF && !IS_INTRA(mb_type)){
            /* Reload the current macroblock's own data into the caches:
             * one nnz flag per block from the 16-bit mask stored at
             * non_zero_count[mb_xy][14] (written by
             * write_back_non_zero_count()), plus mvs and refs from the
             * picture tables. */
            int list;
            int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
            for(i=0; i<16; i++)
                h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
            for(list=0; list<h->list_count; list++){
                if(USES_LIST(mb_type,list)){
                    uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
                    uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
                    int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
                    for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
                        dst[0] = src[0];
                        dst[1] = src[1];
                        dst[2] = src[2];
                        dst[3] = src[3];
                    }
                    *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
                    *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
                    ref += h->b8_stride;
                    *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
                    *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
                }else{
                    fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
                    fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
                }
            }
        }
    }else{
        /* Decoding: a neighbour only counts if it belongs to the current
         * slice; otherwise its type is 0 (treated as unavailable below). */
        topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
        top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
        topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
        left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
        left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
    }

    if(IS_INTRA(mb_type)){
        /* Start from "everything available" and clear bits for each
         * neighbour that is missing, or that is non-intra while
         * constrained_intra_pred is set. */
        h->topleft_samples_available=
        h->top_samples_available=
        h->left_samples_available= 0xFFFF;
        h->topright_samples_available= 0xEEEA;

        if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
            h->topleft_samples_available= 0xB3FF;
            h->top_samples_available= 0x33FF;
            h->topright_samples_available= 0x26EA;
        }
        for(i=0; i<2; i++){
            if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
                h->topleft_samples_available&= 0xDF5F;
                h->left_samples_available&= 0x5F5F;
            }
        }

        if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
            h->topleft_samples_available&= 0x7FFF;

        if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
            h->topright_samples_available&= 0xFBFF;

        if(IS_INTRA4x4(mb_type)){
            /* Border of the intra4x4 mode cache: real modes from intra4x4
             * neighbours, otherwise -1 (neighbour missing, or inter with
             * constrained_intra_pred) or 2. */
            if(IS_INTRA4x4(top_type)){
                h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
                h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
                h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
                h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
            }else{
                int pred;
                if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
                    pred= -1;
                else{
                    pred= 2;
                }
                h->intra4x4_pred_mode_cache[4+8*0]=
                h->intra4x4_pred_mode_cache[5+8*0]=
                h->intra4x4_pred_mode_cache[6+8*0]=
                h->intra4x4_pred_mode_cache[7+8*0]= pred;
            }
            for(i=0; i<2; i++){
                if(IS_INTRA4x4(left_type[i])){
                    h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
                    h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
                }else{
                    int pred;
                    if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
                        pred= -1;
                    else{
                        pred= 2;
                    }
                    h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
                    h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
                }
            }
        }
    }


/*
0 . T T. T T T T
1 L . .L . . . .
2 L . .L . . . .
3 . T TL . . . .
4 L . .L . . . .
5 L . .. . . . .
*/
//FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
    /* Top border of the nnz cache (T entries in the diagram above). */
    if(top_type){
        h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
        h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
        h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
        h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];

        h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
        h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];

        h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
        h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];

    }else{
        /* No top neighbour: 0 for a non-intra macroblock with CABAC,
         * otherwise the marker value 64. */
        h->non_zero_count_cache[4+8*0]=
        h->non_zero_count_cache[5+8*0]=
        h->non_zero_count_cache[6+8*0]=
        h->non_zero_count_cache[7+8*0]=

        h->non_zero_count_cache[1+8*0]=
        h->non_zero_count_cache[2+8*0]=

        h->non_zero_count_cache[1+8*3]=
        h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;

    }

    /* Left border of the nnz cache (L entries in the diagram above);
     * i selects the upper / lower left reference. */
    for (i=0; i<2; i++) {
        if(left_type[i]){
            h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
            h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
            h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
            h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
        }else{
            h->non_zero_count_cache[3+8*1 + 2*8*i]=
            h->non_zero_count_cache[3+8*2 + 2*8*i]=
            h->non_zero_count_cache[0+8*1 +   8*i]=
            h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
        }
    }

    if( h->pps.cabac ) {
        /* Neighbour cbp values; for a missing neighbour the default is
         * 0x1C0 when the current macroblock is intra and 0 otherwise. */
        // top_cbp
        if(top_type) {
            h->top_cbp = h->cbp_table[top_xy];
        } else if(IS_INTRA(mb_type)) {
            h->top_cbp = 0x1C0;
        } else {
            h->top_cbp = 0;
        }
        // left_cbp
        if (left_type[0]) {
            h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
        } else if(IS_INTRA(mb_type)) {
            h->left_cbp = 0x1C0;
        } else {
            h->left_cbp = 0;
        }
        /* Bits 1 and 3 of left_cbp are taken from the cbp bit of the left
         * neighbour selected through left_block[0] / left_block[2]. */
        if (left_type[0]) {
            h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
        }
        if (left_type[1]) {
            h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
        }
    }

#if 1
    if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
        int list;
        for(list=0; list<h->list_count; list++){
            /* A list the macroblock does not use is skipped entirely,
             * unless the macroblock is direct or deblocking is enabled. */
            if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
                /*if(!h->mv_cache_clean[list]){
                    memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
                    memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
                    h->mv_cache_clean[list]= 1;
                }*/
                continue;
            }
            h->mv_cache_clean[list]= 0;

            /* Top border: bottom row of mvs / refs of the top neighbour. */
            if(USES_LIST(top_type, list)){
                const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
                const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
                *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
                *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
                *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
                *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
                h->ref_cache[list][scan8[0] + 0 - 1*8]=
                h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
                h->ref_cache[list][scan8[0] + 2 - 1*8]=
                h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
            }else{
                *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
                *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
                *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
                *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
                *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
            }

            /* Left border: rightmost column of the left neighbour(s), two
             * cache rows per left reference. */
            for(i=0; i<2; i++){
                int cache_idx = scan8[0] - 1 + i*2*8;
                if(USES_LIST(left_type[i], list)){
                    const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
                    const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
                    *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
                    *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
                    h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
                    h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
                }else{
                    *(uint32_t*)h->mv_cache [list][cache_idx  ]=
                    *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
                    h->ref_cache[list][cache_idx  ]=
                    h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
                }
            }

            /* The remaining entries are not filled for deblocking or for
             * temporal direct macroblocks (non-MBAFF only). */
            if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
                continue;

            /* Top-left corner; topleft_partition (0 or -1) is used as a bit
             * mask that selects whether the extra row offset is applied. */
            if(USES_LIST(topleft_type, list)){
                const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
                const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
                *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
                h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
            }else{
                *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
                h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
            }

            /* Top-right corner. */
            if(USES_LIST(topright_type, list)){
                const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
                const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
                *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
                h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
            }else{
                *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
                h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
            }

            if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
                continue;

            /* Mark fixed positions inside the cache as not available and
             * zero their mvs. */
            h->ref_cache[list][scan8[5 ]+1] =
            h->ref_cache[list][scan8[7 ]+1] =
            h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
            h->ref_cache[list][scan8[4 ]] =
            h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
            *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
            *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
            *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
            *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
            *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;

            if( h->pps.cabac ) {
                /* XXX beurk, Load mvd */
                if(USES_LIST(top_type, list)){
                    const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
                }else{
                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
                }
                if(USES_LIST(left_type[0], list)){
                    const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
                }else{
                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
                }
                if(USES_LIST(left_type[1], list)){
                    const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
                }else{
                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
                }
                *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
                *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
                *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
                *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
                *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;

                if(h->slice_type == FF_B_TYPE){
                    /* direct_cache: cleared for the current macroblock,
                     * then its top and left borders are set from the
                     * neighbours (1 for a direct neighbour, direct_table
                     * values for an 8x8 neighbour, else 0). */
                    fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);

                    if(IS_DIRECT(top_type)){
                        *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
                    }else if(IS_8X8(top_type)){
                        int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
                        h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
                        h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
                    }else{
                        *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
                    }

                    if(IS_DIRECT(left_type[0]))
                        h->direct_cache[scan8[0] - 1 + 0*8]= 1;
                    else if(IS_8X8(left_type[0]))
                        h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
                    else
                        h->direct_cache[scan8[0] - 1 + 0*8]= 0;

                    if(IS_DIRECT(left_type[1]))
                        h->direct_cache[scan8[0] - 1 + 2*8]= 1;
                    else if(IS_8X8(left_type[1]))
                        h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
                    else
                        h->direct_cache[scan8[0] - 1 + 2*8]= 0;
                }
            }

            if(FRAME_MBAFF){
                /* Rescale the border entries of neighbours whose
                 * frame/field coding differs from the current macroblock:
                 * MAP_MVS applies MAP_F2F to every border position, and
                 * MAP_F2F is defined differently for the two cases below. */
#define MAP_MVS\
                    MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
                    MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
                    MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
                    MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
                    MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
                    MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
                    MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
                    MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
                    MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
                    MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
                if(MB_FIELD){
                    /* non-interlaced neighbour: ref index doubled, vertical
                     * mv and mvd halved */
#define MAP_F2F(idx, mb_type)\
                    if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
                        h->ref_cache[list][idx] <<= 1;\
                        h->mv_cache[list][idx][1] /= 2;\
                        h->mvd_cache[list][idx][1] /= 2;\
                    }
                    MAP_MVS
#undef MAP_F2F
                }else{
                    /* interlaced neighbour: ref index halved, vertical mv
                     * and mvd doubled */
#define MAP_F2F(idx, mb_type)\
                    if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
                        h->ref_cache[list][idx] >>= 1;\
                        h->mv_cache[list][idx][1] <<= 1;\
                        h->mvd_cache[list][idx][1] <<= 1;\
                    }
                    MAP_MVS
#undef MAP_F2F
                }
            }
        }
    }
#endif

    /* Number (0..2) of top / upper-left neighbours that use the 8x8 transform. */
    h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
}
 551
 552 static inline void write_back_intra_pred_mode(H264Context *h){
 553     const int mb_xy= h->mb_xy;
 554
 555     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 556     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 557     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 558     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 559     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 560     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 561     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 562 }
 563
 564 /**
 565  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 566  */
 567 static inline int check_intra4x4_pred_mode(H264Context *h){
 568     MpegEncContext * const s = &h->s;
 569     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 570     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 571     int i;
 572
 573     if(!(h->top_samples_available&0x8000)){
 574         for(i=0; i<4; i++){
 575             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 576             if(status<0){
 577                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 578                 return -1;
 579             } else if(status){
 580                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 581             }
 582         }
 583     }
 584
 585     if(!(h->left_samples_available&0x8000)){
 586         for(i=0; i<4; i++){
 587             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 588             if(status<0){
 589                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 590                 return -1;
 591             } else if(status){
 592                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 593             }
 594         }
 595     }
 596
 597     return 0;
 598 } //FIXME cleanup like next
 599
 600 /**
 601  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 602  */
 603 static inline int check_intra_pred_mode(H264Context *h, int mode){
 604     MpegEncContext * const s = &h->s;
 605     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 606     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 607
 608     if(mode > 6U) {
 609         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 610         return -1;
 611     }
 612
 613     if(!(h->top_samples_available&0x8000)){
 614         mode= top[ mode ];
 615         if(mode<0){
 616             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 617             return -1;
 618         }
 619     }
 620
 621     if(!(h->left_samples_available&0x8000)){
 622         mode= left[ mode ];
 623         if(mode<0){
 624             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 625             return -1;
 626         }
 627     }
 628
 629     return mode;
 630 }
 631
 632 /**
 633  * gets the predicted intra4x4 prediction mode.
 634  */
 635 static inline int pred_intra_mode(H264Context *h, int n){
 636     const int index8= scan8[n];
 637     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 638     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 639     const int min= FFMIN(left, top);
 640
 641     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 642
 643     if(min<0) return DC_PRED;
 644     else      return min;
 645 }
 646
 647 static inline void write_back_non_zero_count(H264Context *h){
 648     const int mb_xy= h->mb_xy;
 649
 650     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 651     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 652     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 653     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 654     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 655     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 656     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 657
 658     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 659     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 660     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 661
 662     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 663     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 664     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 665
 666     if(FRAME_MBAFF){
 667         // store all luma nnzs, for deblocking
 668         int v = 0, i;
 669         for(i=0; i<16; i++)
 670             v += (!!h->non_zero_count_cache[scan8[i]]) << i;
 671         *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
 672     }
 673 }
 674
 675 /**
 676  * gets the predicted number of non zero coefficients.
 677  * @param n block index
 678  */
 679 static inline int pred_non_zero_count(H264Context *h, int n){
 680     const int index8= scan8[n];
 681     const int left= h->non_zero_count_cache[index8 - 1];
 682     const int top = h->non_zero_count_cache[index8 - 8];
 683     int i= left + top;
 684
 685     if(i<64) i= (i+1)>>1;
 686
 687     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 688
 689     return i&31;
 690 }
 691
     /**
      * Fetches the diagonal neighbour (candidate C of the motion vector
      * prediction): the top-right neighbour of a partition, or the top-left
      * one when the top-right is not available.
      * In MBAFF frames some neighbours are read directly from the current
      * picture and rescaled, because they belong to a macroblock with a
      * different frame/field coding than the current one.
      * @param C receives a pointer to the selected motion vector
      * @param i cache index of the partition's top-left block (callers pass scan8[n])
      * @param list reference list to look at
      * @param part_width partition width in units of 4x4 blocks
      * @return the reference index that belongs to *C (may be LIST_NOT_USED
      *         or whatever "unavailable" marker the cache holds)
      */
 692 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 693     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 694     MpegEncContext *s = &h->s;
 695
 696     /* there is no consistent mapping of mvs to neighboring locations that will
 697      * make mbaff happy, so we can't move all this logic to fill_caches */
 698     if(FRAME_MBAFF){
 699         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 700         const int16_t *mv;
             /* cache slot scan8[0]-2 serves as scratch storage for a rescaled
              * neighbour mv; it is cleared first and *C points at it by default */
 701         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 702         *C = h->mv_cache[list][scan8[0]-2];
 703
             /* frame MB in the bottom half of a pair, block in the top row of the
              * MB (i < scan8[0]+8): the top-right neighbour may be a field MB */
 704         if(!MB_FIELD
 705            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 706             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 707             if(IS_INTERLACED(mb_types[topright_xy])){
                     /* SET_DIAG_MV copies the mv stored at 4x4 position (X4,Y4) of the
                      * current picture into the scratch slot, applies MV_OP to its
                      * vertical component and RETURNS from fetch_diagonal_mv with the
                      * stored reference index modified by REF_OP (or LIST_NOT_USED if
                      * the macroblock at that position does not use this list).
                      * It declares locals, so it may be used once per block scope only. */
 708 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 709                 const int x4 = X4, y4 = Y4;\
 710                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 711                 if(!USES_LIST(mb_type,list))\
 712                     return LIST_NOT_USED;\
 713                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 714                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 715                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 716                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 717
 718                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 719             }
 720         }
             /* top-right missing while the left neighbour (scan8[0]-1) exists:
              * blocks with (i&7)==4 -- presumably the leftmost block column, TODO
              * confirm against scan8 -- take their candidate from the left MB when
              * that MB's frame/field coding differs from the current MB */
 721         if(topright_ref == PART_NOT_AVAILABLE
 722            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 723            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 724             if(!MB_FIELD
 725                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 726                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 727             }
 728             if(MB_FIELD
 729                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 730                && i >= scan8[0]+8){
 731                 // leftshift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's ok.
 732                 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 733             }
 734         }
 735 #undef SET_DIAG_MV
 736     }
 737
         /* regular case: top-right from the cache, else fall back to top-left */
 738     if(topright_ref != PART_NOT_AVAILABLE){
 739         *C= h->mv_cache[list][ i - 8 + part_width ];
 740         return topright_ref;
 741     }else{
 742         tprintf(s->avctx, "topright MV not available\n");
 743
 744         *C= h->mv_cache[list][ i - 8 - 1 ];
 745         return h->ref_cache[list][ i - 8 - 1 ];
 746     }
 747 }
 748
 749 /**
 750  * gets the predicted MV.
 751  * @param n the block index
 752  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 753  * @param mx the x component of the predicted motion vector
 754  * @param my the y component of the predicted motion vector
 755  */
 756 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 757     const int index8= scan8[n];
 758     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 759     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 760     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 761     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 762     const int16_t * C;
 763     int diagonal_ref, match_count;
 764
 765     assert(part_width==1 || part_width==2 || part_width==4);
 766
 767 /* mv_cache
 768   B . . A T T T T
 769   U . . L . . , .
 770   U . . L . . . .
 771   U . . L . . , .
 772   . . . L . . . .
 773 */
 774
 775     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 776     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 777     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 778     if(match_count > 1){ //most common
 779         *mx= mid_pred(A[0], B[0], C[0]);
 780         *my= mid_pred(A[1], B[1], C[1]);
 781     }else if(match_count==1){
 782         if(left_ref==ref){
 783             *mx= A[0];
 784             *my= A[1];
 785         }else if(top_ref==ref){
 786             *mx= B[0];
 787             *my= B[1];
 788         }else{
 789             *mx= C[0];
 790             *my= C[1];
 791         }
 792     }else{
 793         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 794             *mx= A[0];
 795             *my= A[1];
 796         }else{
 797             *mx= mid_pred(A[0], B[0], C[0]);
 798             *my= mid_pred(A[1], B[1], C[1]);
 799         }
 800     }
 801
 802     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 803 }
 804
 805 /**
 806  * gets the directionally predicted 16x8 MV.
 807  * @param n the block index
 808  * @param mx the x component of the predicted motion vector
 809  * @param my the y component of the predicted motion vector
 810  */
 811 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 812     if(n==0){
 813         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 814         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 815
 816         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 817
 818         if(top_ref == ref){
 819             *mx= B[0];
 820             *my= B[1];
 821             return;
 822         }
 823     }else{
 824         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 825         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 826
 827         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 828
 829         if(left_ref == ref){
 830             *mx= A[0];
 831             *my= A[1];
 832             return;
 833         }
 834     }
 835
 836     //RARE
 837     pred_motion(h, n, 4, list, ref, mx, my);
 838 }
 839
 840 /**
 841  * gets the directionally predicted 8x16 MV.
 842  * @param n the block index
 843  * @param mx the x component of the predicted motion vector
 844  * @param my the y component of the predicted motion vector
 845  */
 846 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 847     if(n==0){
 848         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 849         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 850
 851         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 852
 853         if(left_ref == ref){
 854             *mx= A[0];
 855             *my= A[1];
 856             return;
 857         }
 858     }else{
 859         const int16_t * C;
 860         int diagonal_ref;
 861
 862         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 863
 864         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 865
 866         if(diagonal_ref == ref){
 867             *mx= C[0];
 868             *my= C[1];
 869             return;
 870         }
 871     }
 872
 873     //RARE
 874     pred_motion(h, n, 2, list, ref, mx, my);
 875 }
 876
 877 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 878     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 879     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 880
 881     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 882
 883     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 884        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 885        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 886
 887         *mx = *my = 0;
 888         return;
 889     }
 890
 891     pred_motion(h, 0, 4, 0, 0, mx, my);
 892
 893     return;
 894 }
 895
 896 static inline void direct_dist_scale_factor(H264Context * const h){
 897     const int poc = h->s.current_picture_ptr->poc;
 898     const int poc1 = h->ref_list[1][0].poc;
 899     int i;
 900     for(i=0; i<h->ref_count[0]; i++){
 901         int poc0 = h->ref_list[0][i].poc;
 902         int td = av_clip(poc1 - poc0, -128, 127);
 903         if(td == 0 /* FIXME || pic0 is a long-term ref */){
 904             h->dist_scale_factor[i] = 256;
 905         }else{
 906             int tb = av_clip(poc - poc0, -128, 127);
 907             int tx = (16384 + (FFABS(td) >> 1)) / td;
 908             h->dist_scale_factor[i] = av_clip((tb*tx + 32) >> 6, -1024, 1023);
 909         }
 910     }
 911     if(FRAME_MBAFF){
 912         for(i=0; i<h->ref_count[0]; i++){
 913             h->dist_scale_factor_field[2*i] =
 914             h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
 915         }
 916     }
 917 }
 918 static inline void direct_ref_list_init(H264Context * const h){
 919     MpegEncContext * const s = &h->s;
 920     Picture * const ref1 = &h->ref_list[1][0];
 921     Picture * const cur = s->current_picture_ptr;
 922     int list, i, j;
 923     if(cur->pict_type == FF_I_TYPE)
 924         cur->ref_count[0] = 0;
 925     if(cur->pict_type != FF_B_TYPE)
 926         cur->ref_count[1] = 0;
 927     for(list=0; list<2; list++){
 928         cur->ref_count[list] = h->ref_count[list];
 929         for(j=0; j<h->ref_count[list]; j++)
 930             cur->ref_poc[list][j] = h->ref_list[list][j].poc;
 931     }
 932     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 933         return;
 934     for(list=0; list<2; list++){
 935         for(i=0; i<ref1->ref_count[list]; i++){
 936             const int poc = ref1->ref_poc[list][i];
 937             h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
 938             for(j=0; j<h->ref_count[list]; j++)
 939                 if(h->ref_list[list][j].poc == poc){
 940                     h->map_col_to_list0[list][i] = j;
 941                     break;
 942                 }
 943         }
 944     }
 945     if(FRAME_MBAFF){
 946         for(list=0; list<2; list++){
 947             for(i=0; i<ref1->ref_count[list]; i++){
 948                 j = h->map_col_to_list0[list][i];
 949                 h->map_col_to_list0_field[list][2*i] = 2*j;
 950                 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
 951             }
 952         }
 953     }
 954 }
 955
     /**
      * Derives the reference indices and motion vectors of a direct predicted
      * B macroblock and stores them in h->ref_cache and h->mv_cache.
      * If the macroblock is an 8x8 one on entry, only the sub blocks whose
      * h->sub_mb_type is direct are processed; otherwise all four 8x8 blocks
      * (or the whole 16x16 macroblock at once) are filled in.
      * Spatial prediction is used when h->direct_spatial_mv_pred is set,
      * temporal prediction otherwise. Both read the co-located data of the
      * first list 1 reference picture (h->ref_list[1][0]).
      * @param mb_type in: type of the macroblock as decoded so far;
      *                out: partition size, list usage and the direct and
      *                interlaced flags chosen for the prediction
      */
 956 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 957     MpegEncContext * const s = &h->s;
 958     const int mb_xy =   h->mb_xy;
 959     const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
 960     const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
         /* type, motion vectors and reference indices of the co-located
          * macroblock, for both of its lists */
 961     const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
 962     const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
 963     const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
 964     const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
 965     const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
 966     const int is_b8x8 = IS_8X8(*mb_type);
 967     unsigned int sub_mb_type;
 968     int i8, i4;
 969
         /* pick the granularity at which the prediction is carried out */
 970 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
 971     if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
 972         /* FIXME save sub mb types from previous frames (or derive from MVs)
 973          * so we know exactly what block size to use */
 974         sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
 975         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
 976     }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
 977         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
 978         *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
 979     }else{
 980         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
 981         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
 982     }
 983     if(!is_b8x8)
 984         *mb_type |= MB_TYPE_DIRECT2;
 985     if(MB_FIELD)
 986         *mb_type |= MB_TYPE_INTERLACED;
 987
 988     tprintf(s->avctx, "mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);
 989
 990     if(h->direct_spatial_mv_pred){
 991         int ref[2];
 992         int mv[2][2];
 993         int list;
 994
 995         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
 996
 997         /* ref = min(neighbors) */
 998         for(list=0; list<2; list++){
 999             int refa = h->ref_cache[list][scan8[0] - 1];
1000             int refb = h->ref_cache[list][scan8[0] - 8];
1001             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
                 /* -2 presumably is PART_NOT_AVAILABLE (TODO confirm): fall back
                  * from the top-right to the top-left neighbour */
1002             if(refc == -2)
1003                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
                 /* compared as unsigned, so negative entries lose against any
                  * valid index; a negative result means no neighbour had one */
1004             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1005             if(ref[list] < 0)
1006                 ref[list] = -1;
1007         }
1008
             /* no usable neighbour in either list: reference 0 with zero
              * vectors in both lists, else predict each list that has a ref */
1009         if(ref[0] < 0 && ref[1] < 0){
1010             ref[0] = ref[1] = 0;
1011             mv[0][0] = mv[0][1] =
1012             mv[1][0] = mv[1][1] = 0;
1013         }else{
1014             for(list=0; list<2; list++){
1015                 if(ref[list] >= 0)
1016                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1017                 else
1018                     mv[list][0] = mv[list][1] = 0;
1019             }
1020         }
1021
             /* drop the list without a reference from the type flags */
1022         if(ref[1] < 0){
1023             if(!is_b8x8)
1024                 *mb_type &= ~MB_TYPE_L1;
1025             sub_mb_type &= ~MB_TYPE_L1;
1026         }else if(ref[0] < 0){
1027             if(!is_b8x8)
1028                 *mb_type &= ~MB_TYPE_L0;
1029             sub_mb_type &= ~MB_TYPE_L0;
1030         }
1031
             /* current and co-located macroblock differ in frame/field coding:
              * re-address the co-located data and work per 8x8 block */
1032         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1033             int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1034             int mb_types_col[2];
1035             int b8_stride = h->b8_stride;
1036             int b4_stride = h->b_stride;
1037
1038             *mb_type = (*mb_type & ~MB_TYPE_16x16) | MB_TYPE_8x8;
1039
1040             if(IS_INTERLACED(*mb_type)){
                     /* field MB, frame coded co-located pair: start at the top MB
                      * of the pair and use both of its macroblocks */
1041                 mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1042                 mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1043                 if(s->mb_y&1){
1044                     l1ref0 -= 2*b8_stride;
1045                     l1ref1 -= 2*b8_stride;
1046                     l1mv0 -= 4*b4_stride;
1047                     l1mv1 -= 4*b4_stride;
1048                 }
1049                 b8_stride *= 3;
1050                 b4_stride *= 6;
1051             }else{
                     /* frame MB, field coded co-located pair: use the MB of the
                      * pair selected by col_parity, i.e. by which field POC of
                      * the list 1 reference is closer to the current POC */
1052                 int cur_poc = s->current_picture_ptr->poc;
1053                 int *col_poc = h->ref_list[1]->field_poc;
1054                 int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1055                 int dy = 2*col_parity - (s->mb_y&1);
1056                 mb_types_col[0] =
1057                 mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy + col_parity*s->mb_stride];
1058                 l1ref0 += dy*b8_stride;
1059                 l1ref1 += dy*b8_stride;
1060                 l1mv0 += 2*dy*b4_stride;
1061                 l1mv1 += 2*dy*b4_stride;
1062                 b8_stride = 0;
1063             }
1064
1065             for(i8=0; i8<4; i8++){
1066                 int x8 = i8&1;
1067                 int y8 = i8>>1;
1068                 int xy8 = x8+y8*b8_stride;
1069                 int xy4 = 3*x8+y8*b4_stride;
1070                 int a=0, b=0;
1071
1072                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1073                     continue;
1074                 h->sub_mb_type[i8] = sub_mb_type;
1075
1076                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1077                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
                     /* co-located block is inter, uses reference 0 and hardly
                      * moves: only lists with a reference above 0 keep their
                      * predicted vector, the others stay zero */
1078                 if(!IS_INTRA(mb_types_col[y8])
1079                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1080                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1081                     if(ref[0] > 0)
1082                         a= pack16to32(mv[0][0],mv[0][1]);
1083                     if(ref[1] > 0)
1084                         b= pack16to32(mv[1][0],mv[1][1]);
1085                 }else{
1086                     a= pack16to32(mv[0][0],mv[0][1]);
1087                     b= pack16to32(mv[1][0],mv[1][1]);
1088                 }
1089                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1090                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1091             }
1092         }else if(IS_16X16(*mb_type)){
1093             int a=0, b=0;
1094
1095             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1096             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
                 /* same "co-located block is static" test for the whole
                  * macroblock; the list 1 variant is skipped for x264 builds
                  * 1..33 (x264_build == 0 is treated like a recent build) */
1097             if(!IS_INTRA(mb_type_col)
1098                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1099                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1100                        && (h->x264_build>33 || !h->x264_build)))){
1101                 if(ref[0] > 0)
1102                     a= pack16to32(mv[0][0],mv[0][1]);
1103                 if(ref[1] > 0)
1104                     b= pack16to32(mv[1][0],mv[1][1]);
1105             }else{
1106                 a= pack16to32(mv[0][0],mv[0][1]);
1107                 b= pack16to32(mv[1][0],mv[1][1]);
1108             }
1109             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1110             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1111         }else{
1112             for(i8=0; i8<4; i8++){
1113                 const int x8 = i8&1;
1114                 const int y8 = i8>>1;
1115
1116                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1117                     continue;
1118                 h->sub_mb_type[i8] = sub_mb_type;
1119
                     /* start from the predicted vectors, then zero them below
                      * wherever the co-located block turns out to be static */
1120                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1121                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1122                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1123                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1124
1125                 /* col_zero_flag */
1126                 if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
1127                                               || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
1128                                                   && (h->x264_build>33 || !h->x264_build)))){
1129                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
1130                     if(IS_SUB_8X8(sub_mb_type)){
                             /* one decision per 8x8 block, taken from the
                              * co-located 4x4 block in the MB corner (x8*3, y8*3) */
1131                         const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1132                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1133                             if(ref[0] == 0)
1134                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1135                             if(ref[1] == 0)
1136                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1137                         }
1138                     }else
1139                     for(i4=0; i4<4; i4++){
1140                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1141                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1142                             if(ref[0] == 0)
1143                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1144                             if(ref[1] == 0)
1145                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1146                         }
1147                     }
1148                 }
1149             }
1150         }
1151     }else{ /* direct temporal mv pred */
1152         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1153         const int *dist_scale_factor = h->dist_scale_factor;
1154
1155         if(FRAME_MBAFF){
                 /* field macroblocks use the field variants of the tables */
1156             if(IS_INTERLACED(*mb_type)){
1157                 map_col_to_list0[0] = h->map_col_to_list0_field[0];
1158                 map_col_to_list0[1] = h->map_col_to_list0_field[1];
1159                 dist_scale_factor = h->dist_scale_factor_field;
1160             }
1161             if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1162                 /* FIXME assumes direct_8x8_inference == 1 */
1163                 const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1164                 int mb_types_col[2];
1165                 int y_shift;
1166
1167                 *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
1168                          | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
1169                          | (*mb_type & MB_TYPE_INTERLACED);
1170                 sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;
1171
1172                 if(IS_INTERLACED(*mb_type)){
1173                     /* frame to field scaling */
1174                     mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1175                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1176                     if(s->mb_y&1){
1177                         l1ref0 -= 2*h->b8_stride;
1178                         l1ref1 -= 2*h->b8_stride;
1179                         l1mv0 -= 4*h->b_stride;
1180                         l1mv1 -= 4*h->b_stride;
1181                     }
1182                     y_shift = 0;
1183
1184                     if(   (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
1185                        && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
1186                        && !is_b8x8)
1187                         *mb_type |= MB_TYPE_16x8;
1188                     else
1189                         *mb_type |= MB_TYPE_8x8;
1190                 }else{
1191                     /* field to frame scaling */
1192                     /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
1193                      * but in MBAFF, top and bottom POC are equal */
1194                     int dy = (s->mb_y&1) ? 1 : 2;
1195                     mb_types_col[0] =
1196                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1197                     l1ref0 += dy*h->b8_stride;
1198                     l1ref1 += dy*h->b8_stride;
1199                     l1mv0 += 2*dy*h->b_stride;
1200                     l1mv1 += 2*dy*h->b_stride;
1201                     y_shift = 2;
1202
1203                     if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
1204                        && !is_b8x8)
1205                         *mb_type |= MB_TYPE_16x16;
1206                     else
1207                         *mb_type |= MB_TYPE_8x8;
1208                 }
1209
1210                 for(i8=0; i8<4; i8++){
1211                     const int x8 = i8&1;
1212                     const int y8 = i8>>1;
1213                     int ref0, scale;
1214                     const int16_t (*l1mv)[2]= l1mv0;
1215
1216                     if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1217                         continue;
1218                     h->sub_mb_type[i8] = sub_mb_type;
1219
                         /* list 1 always refers to index 0; an intra co-located
                          * block gives reference 0 and zero vectors in list 0 too */
1220                     fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1221                     if(IS_INTRA(mb_types_col[y8])){
1222                         fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1223                         fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1224                         fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1225                         continue;
1226                     }
1227
                         /* prefer the co-located list 0 data, use its list 1 data
                          * when list 0 is unused there; y_shift (0 or 2) adapts row
                          * and reference index between frame and field units */
1228                     ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
1229                     if(ref0 >= 0)
1230                         ref0 = map_col_to_list0[0][ref0*2>>y_shift];
1231                     else{
1232                         ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
1233                         l1mv= l1mv1;
1234                     }
1235                     scale = dist_scale_factor[ref0];
1236                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1237
1238                     {
1239                         const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
                             /* vertical component halved (y_shift 0) or doubled
                              * (y_shift 2) before scaling.
                              * NOTE(review): mv_col[1] may be negative here, which
                              * makes the left shift formally undefined in C */
1240                         int my_col = (mv_col[1]<<y_shift)/2;
1241                         int mx = (scale * mv_col[0] + 128) >> 8;
1242                         int my = (scale * my_col + 128) >> 8;
                             /* list 1 vector = list 0 vector - co-located vector */
1243                         fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1244                         fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1245                     }
1246                 }
1247                 return;
1248             }
1249         }
1250
1251         /* one-to-one mv scaling */
1252
1253         if(IS_16X16(*mb_type)){
1254             int ref, mv0, mv1;
1255
1256             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1257             if(IS_INTRA(mb_type_col)){
1258                 ref=mv0=mv1=0;
1259             }else{
1260                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
1261                                                 : map_col_to_list0[1][l1ref1[0]];
1262                 const int scale = dist_scale_factor[ref0];
1263                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1264                 int mv_l0[2];
                     /* list 0 vector = co-located vector * scale/256 (rounded),
                      * list 1 vector = list 0 vector - co-located vector */
1265                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1266                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1267                 ref= ref0;
1268                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1269                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1270             }
1271             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1272             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1273             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1274         }else{
1275             for(i8=0; i8<4; i8++){
1276                 const int x8 = i8&1;
1277                 const int y8 = i8>>1;
1278                 int ref0, scale;
1279                 const int16_t (*l1mv)[2]= l1mv0;
1280
1281                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1282                     continue;
1283                 h->sub_mb_type[i8] = sub_mb_type;
1284                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1285                 if(IS_INTRA(mb_type_col)){
1286                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1287                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1288                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1289                     continue;
1290                 }
1291
1292                 ref0 = l1ref0[x8 + y8*h->b8_stride];
1293                 if(ref0 >= 0)
1294                     ref0 = map_col_to_list0[0][ref0];
1295                 else{
1296                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
1297                     l1mv= l1mv1;
1298                 }
1299                 scale = dist_scale_factor[ref0];
1300
1301                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1302                 if(IS_SUB_8X8(sub_mb_type)){
                         /* one vector per 8x8 block, taken from the co-located
                          * 4x4 block in the MB corner (x8*3, y8*3) */
1303                     const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1304                     int mx = (scale * mv_col[0] + 128) >> 8;
1305                     int my = (scale * mv_col[1] + 128) >> 8;
1306                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1307                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1308                 }else
1309                 for(i4=0; i4<4; i4++){
1310                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1311                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1312                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1313                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1314                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1315                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1316                 }
1317             }
1318         }
1319     }
1320 }
1321
1322 static inline void write_back_motion(H264Context *h, int mb_type){
1323     MpegEncContext * const s = &h->s;
1324     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1325     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1326     int list;
1327
1328     if(!USES_LIST(mb_type, 0))
1329         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1330
1331     for(list=0; list<h->list_count; list++){
1332         int y;
1333         if(!USES_LIST(mb_type, list))
1334             continue;
1335
1336         for(y=0; y<4; y++){
1337             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1338             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1339         }
1340         if( h->pps.cabac ) {
1341             if(IS_SKIP(mb_type))
1342                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1343             else
1344             for(y=0; y<4; y++){
1345                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1346                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1347             }
1348         }
1349
1350         {
1351             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1352             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1353             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1354             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1355             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1356         }
1357     }
1358
1359     if(h->slice_type == FF_B_TYPE && h->pps.cabac){
1360         if(IS_8X8(mb_type)){
1361             uint8_t *direct_table = &h->direct_table[b8_xy];
1362             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1363             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1364             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1365         }
1366     }
1367 }
1368
1369 /**
1370  * Decodes a network abstraction layer unit.
1371  * @param consumed is the number of bytes used as input
1372  * @param length is the length of the array
1373  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1374  * @returns decoded bytes, might be src+1 if no escapes
1375  */
1376 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1377     int i, si, di;
1378     uint8_t *dst;
1379     int bufidx;
1380
1381 //    src[0]&0x80;                //forbidden bit
1382     h->nal_ref_idc= src[0]>>5;
1383     h->nal_unit_type= src[0]&0x1F;
1384
1385     src++; length--;
1386 #if 0
1387     for(i=0; i<length; i++)
1388         printf("%2X ", src[i]);
1389 #endif
1390     for(i=0; i+1<length; i+=2){
1391         if(src[i]) continue;
1392         if(i>0 && src[i-1]==0) i--;
1393         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1394             if(src[i+2]!=3){
1395                 /* startcode, so we must be past the end */
1396                 length=i;
1397             }
1398             break;
1399         }
1400     }
1401
1402     if(i>=length-1){ //no escaped 0
1403         *dst_length= length;
1404         *consumed= length+1; //+1 for the header
1405         return src;
1406     }
1407
1408     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1409     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
1410     dst= h->rbsp_buffer[bufidx];
1411
1412     if (dst == NULL){
1413         return NULL;
1414     }
1415
1416 //printf("decoding esc\n");
1417     si=di=0;
1418     while(si<length){
1419         //remove escapes (very rare 1:2^22)
1420         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1421             if(src[si+2]==3){ //escape
1422                 dst[di++]= 0;
1423                 dst[di++]= 0;
1424                 si+=3;
1425                 continue;
1426             }else //next start code
1427                 break;
1428         }
1429
1430         dst[di++]= src[si++];
1431     }
1432
1433     *dst_length= di;
1434     *consumed= si + 1;//+1 for the header
1435 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1436     return dst;
1437 }
1438
1439 /**
1440  * identifies the exact end of the bitstream
1441  * @return the length of the trailing, or 0 if damaged
1442  */
1443 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1444     int v= *src;
1445     int r;
1446
1447     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1448
1449     for(r=1; r<9; r++){
1450         if(v&1) return r;
1451         v>>=1;
1452     }
1453     return 0;
1454 }
1455
1456 /**
1457  * idct tranforms the 16 dc values and dequantize them.
1458  * @param qp quantization parameter
1459  */
1460 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1461 #define stride 16
1462     int i;
1463     int temp[16]; //FIXME check if this is a good idea
1464     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1465     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1466
1467 //memset(block, 64, 2*256);
1468 //return;
1469     for(i=0; i<4; i++){
1470         const int offset= y_offset[i];
1471         const int z0= block[offset+stride*0] + block[offset+stride*4];
1472         const int z1= block[offset+stride*0] - block[offset+stride*4];
1473         const int z2= block[offset+stride*1] - block[offset+stride*5];
1474         const int z3= block[offset+stride*1] + block[offset+stride*5];
1475
1476         temp[4*i+0]= z0+z3;
1477         temp[4*i+1]= z1+z2;
1478         temp[4*i+2]= z1-z2;
1479         temp[4*i+3]= z0-z3;
1480     }
1481
1482     for(i=0; i<4; i++){
1483         const int offset= x_offset[i];
1484         const int z0= temp[4*0+i] + temp[4*2+i];
1485         const int z1= temp[4*0+i] - temp[4*2+i];
1486         const int z2= temp[4*1+i] - temp[4*3+i];
1487         const int z3= temp[4*1+i] + temp[4*3+i];
1488
1489         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_resdual
1490         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1491         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1492         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1493     }
1494 }
1495
#if 0
/**
 * dct transforms the 16 dc values.
 * Disabled code; relies on the stride macro still being defined.
 * @param qp quantization parameter ??? FIXME
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    int i;
    int temp[16]; //FIXME check if this is a good idea
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};

    /* first pass: butterflies into temp */
    for(i=0; i<4; i++){
        const DCTELEM *in= block + y_offset[i];
        int *out= temp + 4*i;
        const int sum04= in[stride*0] + in[stride*4];
        const int dif04= in[stride*0] - in[stride*4];
        const int dif15= in[stride*1] - in[stride*5];
        const int sum15= in[stride*1] + in[stride*5];

        out[0]= sum04+sum15;
        out[1]= dif04+dif15;
        out[2]= dif04-dif15;
        out[3]= sum04-sum15;
    }

    /* second pass: butterflies and halving */
    for(i=0; i<4; i++){
        DCTELEM *out= block + x_offset[i];
        const int sum02= temp[4*0+i] + temp[4*2+i];
        const int dif02= temp[4*0+i] - temp[4*2+i];
        const int dif13= temp[4*1+i] - temp[4*3+i];
        const int sum13= temp[4*1+i] + temp[4*3+i];

        out[stride*0 ]= (sum02 + sum13)>>1;
        out[stride*2 ]= (dif02 + dif13)>>1;
        out[stride*8 ]= (dif02 - dif13)>>1;
        out[stride*10]= (sum02 - sum13)>>1;
    }
}
#endif
1535
1536 #undef xStride
1537 #undef stride
1538
1539 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1540     const int stride= 16*2;
1541     const int xStride= 16;
1542     int a,b,c,d,e;
1543
1544     a= block[stride*0 + xStride*0];
1545     b= block[stride*0 + xStride*1];
1546     c= block[stride*1 + xStride*0];
1547     d= block[stride*1 + xStride*1];
1548
1549     e= a-b;
1550     a= a+b;
1551     b= c-d;
1552     c= c+d;
1553
1554     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1555     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1556     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1557     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1558 }
1559
#if 0
/**
 * Applies a 2x2 butterfly transform to the 4 chroma dc values in place.
 * Disabled code.
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int stride= 16*2;
    const int xStride= 16;
    const int v00= block[stride*0 + xStride*0];
    const int v01= block[stride*0 + xStride*1];
    const int v10= block[stride*1 + xStride*0];
    const int v11= block[stride*1 + xStride*1];
    const int sum0= v00 + v01;
    const int dif0= v00 - v01;
    const int sum1= v10 + v11;
    const int dif1= v10 - v11;

    block[stride*0 + xStride*0]= (sum0 + sum1);
    block[stride*0 + xStride*1]= (dif0 + dif1);
    block[stride*1 + xStride*0]= (sum0 - sum1);
    block[stride*1 + xStride*1]= (dif0 - dif1);
}
#endif
1582
1583 /**
1584  * gets the chroma qp.
1585  */
1586 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1587     return h->pps.chroma_qp_table[t][qscale & 0xff];
1588 }
1589
1590 //FIXME need to check that this does not overflow signed 32 bit for low qp, I am not sure, it's very close
1591 //FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
1592 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
1593     int i;
1594     const int * const quant_table= quant_coeff[qscale];
1595     const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1596     const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1597     const unsigned int threshold2= (threshold1<<1);
1598     int last_non_zero;
1599
1600     if(separate_dc){
1601         if(qscale<=18){
1602             //avoid overflows
1603             const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1604             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1605             const unsigned int dc_threshold2= (dc_threshold1<<1);
1606
1607             int level= block[0]*quant_coeff[qscale+18][0];
1608             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1609                 if(level>0){
1610                     level= (dc_bias + level)>>(QUANT_SHIFT-2);
1611                     block[0]= level;
1612                 }else{
1613                     level= (dc_bias - level)>>(QUANT_SHIFT-2);
1614                     block[0]= -level;
1615                 }
1616 //                last_non_zero = i;
1617             }else{
1618                 block[0]=0;
1619             }
1620         }else{
1621             const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1622             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1623             const unsigned int dc_threshold2= (dc_threshold1<<1);
1624
1625             int level= block[0]*quant_table[0];
1626             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1627                 if(level>0){
1628                     level= (dc_bias + level)>>(QUANT_SHIFT+1);
1629                     block[0]= level;
1630                 }else{
1631                     level= (dc_bias - level)>>(QUANT_SHIFT+1);
1632                     block[0]= -level;
1633                 }
1634 //                last_non_zero = i;
1635             }else{
1636                 block[0]=0;
1637             }
1638         }
1639         last_non_zero= 0;
1640         i=1;
1641     }else{
1642         last_non_zero= -1;
1643         i=0;
1644     }
1645
1646     for(; i<16; i++){
1647         const int j= scantable[i];
1648         int level= block[j]*quant_table[j];
1649
1650 //        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
1651 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
1652         if(((unsigned)(level+threshold1))>threshold2){
1653             if(level>0){
1654                 level= (bias + level)>>QUANT_SHIFT;
1655                 block[j]= level;
1656             }else{
1657                 level= (bias - level)>>QUANT_SHIFT;
1658                 block[j]= -level;
1659             }
1660             last_non_zero = i;
1661         }else{
1662             block[j]=0;
1663         }
1664     }
1665
1666     return last_non_zero;
1667 }
1668
1669 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1670                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1671                            int src_x_offset, int src_y_offset,
1672                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1673     MpegEncContext * const s = &h->s;
1674     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1675     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1676     const int luma_xy= (mx&3) + ((my&3)<<2);
1677     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1678     uint8_t * src_cb, * src_cr;
1679     int extra_width= h->emu_edge_width;
1680     int extra_height= h->emu_edge_height;
1681     int emu=0;
1682     const int full_mx= mx>>2;
1683     const int full_my= my>>2;
1684     const int pic_width  = 16*s->mb_width;
1685     const int pic_height = 16*s->mb_height >> MB_FIELD;
1686
1687     if(!pic->data[0]) //FIXME this is unacceptable, some senseable error concealment must be done for missing reference frames
1688         return;
1689
1690     if(mx&7) extra_width -= 3;
1691     if(my&7) extra_height -= 3;
1692
1693     if(   full_mx < 0-extra_width
1694        || full_my < 0-extra_height
1695        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1696        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1697         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1698             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1699         emu=1;
1700     }
1701
1702     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1703     if(!square){
1704         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1705     }
1706
1707     if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1708
1709     if(MB_FIELD){
1710         // chroma offset when predicting from a field of opposite parity
1711         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1712         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1713     }
1714     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1715     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1716
1717     if(emu){
1718         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1719             src_cb= s->edge_emu_buffer;
1720     }
1721     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1722
1723     if(emu){
1724         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1725             src_cr= s->edge_emu_buffer;
1726     }
1727     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1728 }
1729
1730 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1731                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1732                            int x_offset, int y_offset,
1733                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1734                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1735                            int list0, int list1){
1736     MpegEncContext * const s = &h->s;
1737     qpel_mc_func *qpix_op=  qpix_put;
1738     h264_chroma_mc_func chroma_op= chroma_put;
1739
1740     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1741     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1742     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1743     x_offset += 8*s->mb_x;
1744     y_offset += 8*(s->mb_y >> MB_FIELD);
1745
1746     if(list0){
1747         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1748         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1749                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1750                            qpix_op, chroma_op);
1751
1752         qpix_op=  qpix_avg;
1753         chroma_op= chroma_avg;
1754     }
1755
1756     if(list1){
1757         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1758         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1759                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1760                            qpix_op, chroma_op);
1761     }
1762 }
1763
1764 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1765                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1766                            int x_offset, int y_offset,
1767                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1768                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1769                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1770                            int list0, int list1){
1771     MpegEncContext * const s = &h->s;
1772
1773     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1774     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1775     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1776     x_offset += 8*s->mb_x;
1777     y_offset += 8*(s->mb_y >> MB_FIELD);
1778
1779     if(list0 && list1){
1780         /* don't optimize for luma-only case, since B-frames usually
1781          * use implicit weights => chroma too. */
1782         uint8_t *tmp_cb = s->obmc_scratchpad;
1783         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1784         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1785         int refn0 = h->ref_cache[0][ scan8[n] ];
1786         int refn1 = h->ref_cache[1][ scan8[n] ];
1787
1788         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1789                     dest_y, dest_cb, dest_cr,
1790                     x_offset, y_offset, qpix_put, chroma_put);
1791         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1792                     tmp_y, tmp_cb, tmp_cr,
1793                     x_offset, y_offset, qpix_put, chroma_put);
1794
1795         if(h->use_weight == 2){
1796             int weight0 = h->implicit_weight[refn0][refn1];
1797             int weight1 = 64 - weight0;
1798             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1799             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1800             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1801         }else{
1802             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1803                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1804                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1805             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1806                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1807                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1808             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1809                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1810                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1811         }
1812     }else{
1813         int list = list1 ? 1 : 0;
1814         int refn = h->ref_cache[list][ scan8[n] ];
1815         Picture *ref= &h->ref_list[list][refn];
1816         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1817                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1818                     qpix_put, chroma_put);
1819
1820         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1821                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1822         if(h->use_weight_chroma){
1823             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1824                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1825             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1826                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1827         }
1828     }
1829 }
1830
1831 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1832                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1833                            int x_offset, int y_offset,
1834                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1835                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1836                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1837                            int list0, int list1){
1838     if((h->use_weight==2 && list0 && list1
1839         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1840        || h->use_weight==1)
1841         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1842                          x_offset, y_offset, qpix_put, chroma_put,
1843                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1844     else
1845         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1846                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1847 }
1848
1849 static inline void prefetch_motion(H264Context *h, int list){
1850     /* fetch pixels for estimated mv 4 macroblocks ahead
1851      * optimized for 64byte cache lines */
1852     MpegEncContext * const s = &h->s;
1853     const int refn = h->ref_cache[list][scan8[0]];
1854     if(refn >= 0){
1855         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1856         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1857         uint8_t **src= h->ref_list[list][refn].data;
1858         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1859         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1860         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1861         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1862     }
1863 }
1864
1865 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1866                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1867                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1868                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1869     MpegEncContext * const s = &h->s;
1870     const int mb_xy= h->mb_xy;
1871     const int mb_type= s->current_picture.mb_type[mb_xy];
1872
1873     assert(IS_INTER(mb_type));
1874
1875     prefetch_motion(h, 0);
1876
1877     if(IS_16X16(mb_type)){
1878         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1879                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1880                 &weight_op[0], &weight_avg[0],
1881                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1882     }else if(IS_16X8(mb_type)){
1883         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1884                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1885                 &weight_op[1], &weight_avg[1],
1886                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1887         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1888                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1889                 &weight_op[1], &weight_avg[1],
1890                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1891     }else if(IS_8X16(mb_type)){
1892         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1893                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1894                 &weight_op[2], &weight_avg[2],
1895                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1896         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1897                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1898                 &weight_op[2], &weight_avg[2],
1899                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1900     }else{
1901         int i;
1902
1903         assert(IS_8X8(mb_type));
1904
1905         for(i=0; i<4; i++){
1906             const int sub_mb_type= h->sub_mb_type[i];
1907             const int n= 4*i;
1908             int x_offset= (i&1)<<2;
1909             int y_offset= (i&2)<<1;
1910
1911             if(IS_SUB_8X8(sub_mb_type)){
1912                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1913                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1914                     &weight_op[3], &weight_avg[3],
1915                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1916             }else if(IS_SUB_8X4(sub_mb_type)){
1917                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1918                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1919                     &weight_op[4], &weight_avg[4],
1920                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1921                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1922                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1923                     &weight_op[4], &weight_avg[4],
1924                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1925             }else if(IS_SUB_4X8(sub_mb_type)){
1926                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1927                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1928                     &weight_op[5], &weight_avg[5],
1929                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1930                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1931                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1932                     &weight_op[5], &weight_avg[5],
1933                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1934             }else{
1935                 int j;
1936                 assert(IS_SUB_4X4(sub_mb_type));
1937                 for(j=0; j<4; j++){
1938                     int sub_x_offset= x_offset + 2*(j&1);
1939                     int sub_y_offset= y_offset +   (j&2);
1940                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1941                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1942                         &weight_op[6], &weight_avg[6],
1943                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1944                 }
1945             }
1946         }
1947     }
1948
1949     prefetch_motion(h, 1);
1950 }
1951
1952 static av_cold void decode_init_vlc(void){
1953     static int done = 0;
1954
1955     if (!done) {
1956         int i;
1957         done = 1;
1958
1959         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1960                  &chroma_dc_coeff_token_len [0], 1, 1,
1961                  &chroma_dc_coeff_token_bits[0], 1, 1, 1);
1962
1963         for(i=0; i<4; i++){
1964             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1965                      &coeff_token_len [i][0], 1, 1,
1966                      &coeff_token_bits[i][0], 1, 1, 1);
1967         }
1968
1969         for(i=0; i<3; i++){
1970             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1971                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1972                      &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
1973         }
1974         for(i=0; i<15; i++){
1975             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
1976                      &total_zeros_len [i][0], 1, 1,
1977                      &total_zeros_bits[i][0], 1, 1, 1);
1978         }
1979
1980         for(i=0; i<6; i++){
1981             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
1982                      &run_len [i][0], 1, 1,
1983                      &run_bits[i][0], 1, 1, 1);
1984         }
1985         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1986                  &run_len [6][0], 1, 1,
1987                  &run_bits[6][0], 1, 1, 1);
1988     }
1989 }
1990
1991 static void free_tables(H264Context *h){
1992     int i;
1993     H264Context *hx;
1994     av_freep(&h->intra4x4_pred_mode);
1995     av_freep(&h->chroma_pred_mode_table);
1996     av_freep(&h->cbp_table);
1997     av_freep(&h->mvd_table[0]);
1998     av_freep(&h->mvd_table[1]);
1999     av_freep(&h->direct_table);
2000     av_freep(&h->non_zero_count);
2001     av_freep(&h->slice_table_base);
2002     h->slice_table= NULL;
2003
2004     av_freep(&h->mb2b_xy);
2005     av_freep(&h->mb2b8_xy);
2006
2007     for(i = 0; i < MAX_SPS_COUNT; i++)
2008         av_freep(h->sps_buffers + i);
2009
2010     for(i = 0; i < MAX_PPS_COUNT; i++)
2011         av_freep(h->pps_buffers + i);
2012
2013     for(i = 0; i < h->s.avctx->thread_count; i++) {
2014         hx = h->thread_context[i];
2015         if(!hx) continue;
2016         av_freep(&hx->top_borders[1]);
2017         av_freep(&hx->top_borders[0]);
2018         av_freep(&hx->s.obmc_scratchpad);
2019     }
2020 }
2021
2022 static void init_dequant8_coeff_table(H264Context *h){
2023     int i,q,x;
2024     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2025     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2026     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2027
2028     for(i=0; i<2; i++ ){
2029         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2030             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2031             break;
2032         }
2033
2034         for(q=0; q<52; q++){
2035             int shift = ff_div6[q];
2036             int idx = ff_rem6[q];
2037             for(x=0; x<64; x++)
2038                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2039                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2040                     h->pps.scaling_matrix8[i][x]) << shift;
2041         }
2042     }
2043 }
2044
2045 static void init_dequant4_coeff_table(H264Context *h){
2046     int i,j,q,x;
2047     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2048     for(i=0; i<6; i++ ){
2049         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2050         for(j=0; j<i; j++){
2051             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2052                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2053                 break;
2054             }
2055         }
2056         if(j<i)
2057             continue;
2058
2059         for(q=0; q<52; q++){
2060             int shift = ff_div6[q] + 2;
2061             int idx = ff_rem6[q];
2062             for(x=0; x<16; x++)
2063                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2064                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2065                     h->pps.scaling_matrix4[i][x]) << shift;
2066         }
2067     }
2068 }
2069
2070 static void init_dequant_tables(H264Context *h){
2071     int i,x;
2072     init_dequant4_coeff_table(h);
2073     if(h->pps.transform_8x8_mode)
2074         init_dequant8_coeff_table(h);
2075     if(h->sps.transform_bypass){
2076         for(i=0; i<6; i++)
2077             for(x=0; x<16; x++)
2078                 h->dequant4_coeff[i][0][x] = 1<<6;
2079         if(h->pps.transform_8x8_mode)
2080             for(i=0; i<2; i++)
2081                 for(x=0; x<64; x++)
2082                     h->dequant8_coeff[i][0][x] = 1<<6;
2083     }
2084 }
2085
2086
2087 /**
2088  * allocates tables.
2089  * needs width/height
2090  */
2091 static int alloc_tables(H264Context *h){
2092     MpegEncContext * const s = &h->s;
2093     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2094     int x,y;
2095
2096     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2097
2098     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2099     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
2100     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2101
2102     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2103     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2104     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2105     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2106
2107     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(uint8_t));
2108     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2109
2110     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2111     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2112     for(y=0; y<s->mb_height; y++){
2113         for(x=0; x<s->mb_width; x++){
2114             const int mb_xy= x + y*s->mb_stride;
2115             const int b_xy = 4*x + 4*y*h->b_stride;
2116             const int b8_xy= 2*x + 2*y*h->b8_stride;
2117
2118             h->mb2b_xy [mb_xy]= b_xy;
2119             h->mb2b8_xy[mb_xy]= b8_xy;
2120         }
2121     }
2122
2123     s->obmc_scratchpad = NULL;
2124
2125     if(!h->dequant4_coeff[0])
2126         init_dequant_tables(h);
2127
2128     return 0;
2129 fail:
2130     free_tables(h);
2131     return -1;
2132 }
2133
2134 /**
2135  * Mimic alloc_tables(), but for every context thread.
2136  */
2137 static void clone_tables(H264Context *dst, H264Context *src){
2138     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2139     dst->non_zero_count           = src->non_zero_count;
2140     dst->slice_table              = src->slice_table;
2141     dst->cbp_table                = src->cbp_table;
2142     dst->mb2b_xy                  = src->mb2b_xy;
2143     dst->mb2b8_xy                 = src->mb2b8_xy;
2144     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2145     dst->mvd_table[0]             = src->mvd_table[0];
2146     dst->mvd_table[1]             = src->mvd_table[1];
2147     dst->direct_table             = src->direct_table;
2148
2149     dst->s.obmc_scratchpad = NULL;
2150     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2151 }
2152
2153 /**
2154  * Init context
2155  * Allocate buffers which are not shared amongst multiple threads.
2156  */
2157 static int context_init(H264Context *h){
2158     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2159     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2160
2161     return 0;
2162 fail:
2163     return -1; // free_tables will clean up for us
2164 }
2165
2166 static av_cold void common_init(H264Context *h){
2167     MpegEncContext * const s = &h->s;
2168
2169     s->width = s->avctx->width;
2170     s->height = s->avctx->height;
2171     s->codec_id= s->avctx->codec->id;
2172
2173     ff_h264_pred_init(&h->hpc, s->codec_id);
2174
2175     h->dequant_coeff_pps= -1;
2176     s->unrestricted_mv=1;
2177     s->decode=1; //FIXME
2178
2179     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2180     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2181 }
2182
2183 static av_cold int decode_init(AVCodecContext *avctx){
2184     H264Context *h= avctx->priv_data;
2185     MpegEncContext * const s = &h->s;
2186
2187     MPV_decode_defaults(s);
2188
2189     s->avctx = avctx;
2190     common_init(h);
2191
2192     s->out_format = FMT_H264;
2193     s->workaround_bugs= avctx->workaround_bugs;
2194
2195     // set defaults
2196 //    s->decode_mb= ff_h263_decode_mb;
2197     s->quarter_sample = 1;
2198     s->low_delay= 1;
2199
2200     if(avctx->codec_id == CODEC_ID_SVQ3)
2201         avctx->pix_fmt= PIX_FMT_YUVJ420P;
2202     else
2203         avctx->pix_fmt= PIX_FMT_YUV420P;
2204
2205     decode_init_vlc();
2206
2207     if(avctx->extradata_size > 0 && avctx->extradata &&
2208        *(char *)avctx->extradata == 1){
2209         h->is_avc = 1;
2210         h->got_avcC = 0;
2211     } else {
2212         h->is_avc = 0;
2213     }
2214
2215     h->thread_context[0] = h;
2216     return 0;
2217 }
2218
2219 static int frame_start(H264Context *h){
2220     MpegEncContext * const s = &h->s;
2221     int i;
2222
2223     if(MPV_frame_start(s, s->avctx) < 0)
2224         return -1;
2225     ff_er_frame_start(s);
2226     /*
2227      * MPV_frame_start uses pict_type to derive key_frame.
2228      * This is incorrect for H.264; IDR markings must be used.
2229      * Zero here; IDR markings per slice in frame or fields are OR'd in later.
2230      * See decode_nal_units().
2231      */
2232     s->current_picture_ptr->key_frame= 0;
2233
2234     assert(s->linesize && s->uvlinesize);
2235
2236     for(i=0; i<16; i++){
2237         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2238         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2239     }
2240     for(i=0; i<4; i++){
2241         h->block_offset[16+i]=
2242         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2243         h->block_offset[24+16+i]=
2244         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2245     }
2246
2247     /* can't be in alloc_tables because linesize isn't known there.
2248      * FIXME: redo bipred weight to not require extra buffer? */
2249     for(i = 0; i < s->avctx->thread_count; i++)
2250         if(!h->thread_context[i]->s.obmc_scratchpad)
2251             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2252
2253     /* some macroblocks will be accessed before they're available */
2254     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2255         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
2256
2257 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2258
2259     // We mark the current picture as non reference after allocating it, so
2260     // that if we break out due to an error it can be released automatically
2261     // in the next MPV_frame_start().
2262     // SVQ3 as well as most other codecs have only last/next/current and thus
2263     // get released even with set reference, besides SVQ3 and others do not
2264     // mark frames as reference later "naturally".
2265     if(s->codec_id != CODEC_ID_SVQ3)
2266         s->current_picture_ptr->reference= 0;
2267     return 0;
2268 }
2269
2270 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2271     MpegEncContext * const s = &h->s;
2272     int i;
2273
2274     src_y  -=   linesize;
2275     src_cb -= uvlinesize;
2276     src_cr -= uvlinesize;
2277
2278     // There are two lines saved, the line above the the top macroblock of a pair,
2279     // and the line above the bottom macroblock
2280     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2281     for(i=1; i<17; i++){
2282         h->left_border[i]= src_y[15+i*  linesize];
2283     }
2284
2285     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2286     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2287
2288     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2289         h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
2290         h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
2291         for(i=1; i<9; i++){
2292             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
2293             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
2294         }
2295         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2296         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2297     }
2298 }
2299
/**
 * Exchange the pixels left of and above the current macroblock in the
 * picture with the copies held in h->left_border / h->top_borders[0].
 *
 * With the XCHG macro below the picture side always receives the old value
 * of the saved border; the saved border receives the old picture value only
 * when the macro's last argument is nonzero. Calls that pass the xchg
 * parameter therefore swap for xchg != 0 and only copy border -> picture for
 * xchg == 0; calls that pass the constant 1 always swap.
 *
 * @param src_y,src_cb,src_cr top-left pixel of the macroblock in each plane
 * @param xchg   see above
 * @param simple nonzero forces chroma to be processed regardless of
 *               CODEC_FLAG_GRAY
 */
2300 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2301     MpegEncContext * const s = &h->s;
2302     int temp8, i;
2303     uint64_t temp64;
2304     int deblock_left;
2305     int deblock_top;
2306     int mb_xy;
2307
         /* With deblocking_filter == 2 a neighbour only counts when its
          * slice_table entry equals that of the current macroblock; otherwise
          * it is enough that the macroblock is not in the first column/row. */
2308     if(h->deblocking_filter == 2) {
2309         mb_xy = h->mb_xy;
2310         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2311         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2312     } else {
2313         deblock_left = (s->mb_x > 0);
2314         deblock_top =  (s->mb_y > 0);
2315     }
2316
         /* step back to the pixel one row above and one column left of the macroblock */
2317     src_y  -=   linesize + 1;
2318     src_cb -= uvlinesize + 1;
2319     src_cr -= uvlinesize + 1;
2320
         /* XCHG(a,b,t,xchg): b always gets the old a; a gets the old b only if xchg.
          * Not wrapped in do{}while(0), so it must only be used inside braces. */
2321 #define XCHG(a,b,t,xchg)\
2322 t= a;\
2323 if(xchg)\
2324     a= b;\
2325 b= t;
2326
         /* left luma column; index 0 is the pixel in the row above and is
          * skipped when the top neighbour is not used */
2327     if(deblock_left){
2328         for(i = !deblock_top; i<17; i++){
2329             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2330         }
2331     }
2332
         /* luma row above: 16 pixels of this macroblock plus the first 8 of
          * the next column's saved row when there is a macroblock to the right */
2333     if(deblock_top){
2334         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2335         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2336         if(s->mb_x+1 < s->mb_width){
2337             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2338         }
2339     }
2340
         /* same for both chroma planes: cb at left_border[17..], cr 9 entries
          * later; top rows at offsets 16 (cb) and 24 (cr) */
2341     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2342         if(deblock_left){
2343             for(i = !deblock_top; i<9; i++){
2344                 XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
2345                 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
2346             }
2347         }
2348         if(deblock_top){
2349             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2350             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2351         }
2352     }
2353 }
2354
2355 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
2356     MpegEncContext * const s = &h->s;
2357     int i;
2358
2359     src_y  -= 2 *   linesize;
2360     src_cb -= 2 * uvlinesize;
2361     src_cr -= 2 * uvlinesize;
2362
2363     // There are two lines saved, the line above the the top macroblock of a pair,
2364     // and the line above the bottom macroblock
2365     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2366     h->left_border[1]= h->top_borders[1][s->mb_x][15];
2367     for(i=2; i<34; i++){
2368         h->left_border[i]= src_y[15+i*  linesize];
2369     }
2370
2371     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
2372     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
2373     *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
2374     *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
2375
2376     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2377         h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
2378         h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
2379         h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
2380         h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
2381         for(i=2; i<18; i++){
2382             h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
2383             h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
2384         }
2385         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
2386         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
2387         *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
2388         *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
2389     }
2390 }
2391
/**
 * Macroblock pair variant of xchg_mb_border(): exchange the pixels left of
 * and above a macroblock pair in the picture with the copies held in
 * h->left_border and h->top_borders[0] / h->top_borders[1] (two saved rows).
 *
 * With the XCHG macro below the picture side always receives the old value
 * of the saved border; the saved border receives the old picture value only
 * when the macro's last argument is nonzero.
 *
 * @param src_y,src_cb,src_cr top-left pixel of the pair in each plane
 * @param xchg passed to XCHG for the left columns and the first 8 luma
 *             pixels of each top row; all other exchanges always swap
 */
2392 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
2393     MpegEncContext * const s = &h->s;
2394     int temp8, i;
2395     uint64_t temp64;
         /* the top neighbour is only used from the second macroblock pair row on (mb_y > 1) */
2396     int deblock_left = (s->mb_x > 0);
2397     int deblock_top  = (s->mb_y > 1);
2398
2399     tprintf(s->avctx, "xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
2400
         /* step back to the pixel two rows above and one column left of the pair */
2401     src_y  -= 2 *   linesize + 1;
2402     src_cb -= 2 * uvlinesize + 1;
2403     src_cr -= 2 * uvlinesize + 1;
2404
         /* XCHG(a,b,t,xchg): b always gets the old a; a gets the old b only if xchg.
          * The definition has to stay textually identical to the one in
          * xchg_mb_border(), since the macro is defined a second time here. */
2405 #define XCHG(a,b,t,xchg)\
2406 t= a;\
2407 if(xchg)\
2408     a= b;\
2409 b= t;
2410
         /* left luma column; indices 0 and 1 are the two rows above the pair
          * and are skipped when the top neighbour is not used */
2411     if(deblock_left){
2412         for(i = (!deblock_top)<<1; i<34; i++){
2413             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2414         }
2415     }
2416
         /* the two luma rows above the pair, plus the first 8 pixels of the
          * next column's saved rows when there is a macroblock to the right */
2417     if(deblock_top){
2418         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2419         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2420         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
2421         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
2422         if(s->mb_x+1 < s->mb_width){
2423             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2424             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
2425         }
2426     }
2427
         /* same for both chroma planes: cb at left_border[34..], cr 18 entries
          * later; top rows at offsets 16 (cb) and 24 (cr) */
2428     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2429         if(deblock_left){
2430             for(i = (!deblock_top) << 1; i<18; i++){
2431                 XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
2432                 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
2433             }
2434         }
2435         if(deblock_top){
2436             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2437             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2438             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
2439             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
2440         }
2441     }
2442 }
2443
/**
 * Reconstruct the current macroblock (s->mb_x, s->mb_y) into
 * s->current_picture and, if h->deblocking_filter is set, run the
 * deblocking filter on it.
 *
 * Steps, in order: compute the destination pointers, select line sizes and
 * block offsets (frame or field macroblock), select the IDCT functions,
 * then either copy PCM samples, or perform intra prediction / motion
 * compensation followed by adding the luma and chroma residuals, and
 * finally deblock.
 *
 * @param simple nonzero disables the branches guarded by !simple (field
 *               macroblocks, MBAFF, intra PCM) and treats the stream as
 *               H.264 with chroma always decoded; it is passed as a
 *               constant by hl_decode_mb_simple() / hl_decode_mb_complex().
 */
2444 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2445     MpegEncContext * const s = &h->s;
2446     const int mb_x= s->mb_x;
2447     const int mb_y= s->mb_y;
2448     const int mb_xy= h->mb_xy;
2449     const int mb_type= s->current_picture.mb_type[mb_xy];
2450     uint8_t  *dest_y, *dest_cb, *dest_cr;
2451     int linesize, uvlinesize /*dct_offset*/;
2452     int i;
2453     int *block_offset = &h->block_offset[0];
2454     const unsigned int bottom = mb_y & 1;
2455     const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
2456     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2457     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2458
         /* top-left pixel of this macroblock in each plane: 16x16 luma, 8x8 chroma */
2459     dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2460     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2461     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2462
2463     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2464     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2465
         /* Field macroblock: lines are interleaved, so the line step is doubled,
          * the second half of block_offset[] is used and the bottom macroblock
          * starts 15 luma / 7 chroma lines higher than its frame position. */
2466     if (!simple && MB_FIELD) {
2467         linesize   = h->mb_linesize   = s->linesize * 2;
2468         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2469         block_offset = &h->block_offset[24];
2470         if(mb_y&1){ //FIXME move out of this func?
2471             dest_y -= s->linesize*15;
2472             dest_cb-= s->uvlinesize*7;
2473             dest_cr-= s->uvlinesize*7;
2474         }
             /* in MBAFF frames the cached reference indices of the used lists
              * are rewritten to (16+ref)^(mb_y&1) */
2475         if(FRAME_MBAFF) {
2476             int list;
2477             for(list=0; list<h->list_count; list++){
2478                 if(!USES_LIST(mb_type, list))
2479                     continue;
2480                 if(IS_16X16(mb_type)){
2481                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2482                     fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2483                 }else{
2484                     for(i=0; i<16; i+=4){
2485                         //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
2486                         int ref = h->ref_cache[list][scan8[i]];
2487                         if(ref >= 0)
2488                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2489                     }
2490                 }
2491             }
2492         }
2493     } else {
2494         linesize   = h->mb_linesize   = s->linesize;
2495         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2496 //        dct_offset = s->linesize * 16;
2497     }
2498
         /* luma residual functions: plain pixel addition for transform bypass,
          * otherwise the 8x8 or 4x4 IDCT depending on the macroblock type */
2499     if(transform_bypass){
2500         idct_dc_add =
2501         idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2502     }else if(IS_8x8DCT(mb_type)){
2503         idct_dc_add = s->dsp.h264_idct8_dc_add;
2504         idct_add = s->dsp.h264_idct8_add;
2505     }else{
2506         idct_dc_add = s->dsp.h264_idct_dc_add;
2507         idct_add = s->dsp.h264_idct_add;
2508     }
2509
         /* MBAFF intra with deblocking: exchange the pair borders once per pair
          * before prediction (for the top macroblock, or for the bottom one if
          * the macroblock above it is not intra) */
2510     if(!simple && FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
2511        && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
2512         int mbt_y = mb_y&~1;
2513         uint8_t *top_y  = s->current_picture.data[0] + (mbt_y * 16* s->linesize  ) + mb_x * 16;
2514         uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2515         uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2516         xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
2517     }
2518
2519     if (!simple && IS_INTRA_PCM(mb_type)) {
2520         unsigned int x, y;
2521
2522         // The pixels are stored in h->mb array in the same order as levels,
2523         // copy them in output in the correct order.
2524         for(i=0; i<16; i++) {
2525             for (y=0; y<4; y++) {
2526                 for (x=0; x<4; x++) {
2527                     *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
2528                 }
2529             }
2530         }
2531         for(i=16; i<16+4; i++) {
2532             for (y=0; y<4; y++) {
2533                 for (x=0; x<4; x++) {
2534                     *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
2535                 }
2536             }
2537         }
2538         for(i=20; i<20+4; i++) {
2539             for (y=0; y<4; y++) {
2540                 for (x=0; x<4; x++) {
2541                     *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
2542                 }
2543             }
2544         }
2545     } else {
2546         if(IS_INTRA(mb_type)){
                 /* outside MBAFF the borders are exchanged here before intra
                  * prediction and again (with xchg=0) once prediction is done */
2547             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
2548                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2549
2550             if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2551                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2552                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2553             }
2554
2555             if(IS_INTRA4x4(mb_type)){
2556                 if(simple || !s->encoding){
                         /* intra NxN: each block is predicted and its residual
                          * added before the next block is predicted */
2557                     if(IS_8x8DCT(mb_type)){
2558                         for(i=0; i<16; i+=4){
2559                             uint8_t * const ptr= dest_y + block_offset[i];
2560                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2561                             const int nnz = h->non_zero_count_cache[ scan8[i] ];
2562                             h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2563                                                    (h->topright_samples_available<<i)&0x4000, linesize);
2564                             if(nnz){
                                     /* a single nonzero coefficient that is the DC
                                      * one: use the DC-only function */
2565                                 if(nnz == 1 && h->mb[i*16])
2566                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2567                                 else
2568                                     idct_add(ptr, h->mb + i*16, linesize);
2569                             }
2570                         }
2571                     }else
2572                     for(i=0; i<16; i++){
2573                         uint8_t * const ptr= dest_y + block_offset[i];
2574                         uint8_t *topright;
2575                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2576                         int nnz, tr;
2577
                             /* only these two modes get a top-right pointer; when the
                              * top-right samples are unavailable, the last pixel above
                              * the block is replicated four times into tr instead */
2578                         if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2579                             const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2580                             assert(mb_y || linesize <= block_offset[i]);
2581                             if(!topright_avail){
2582                                 tr= ptr[3 - linesize]*0x01010101;
2583                                 topright= (uint8_t*) &tr;
2584                             }else
2585                                 topright= ptr + 4 - linesize;
2586                         }else
2587                             topright= NULL;
2588
2589                         h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2590                         nnz = h->non_zero_count_cache[ scan8[i] ];
2591                         if(nnz){
2592                             if(is_h264){
2593                                 if(nnz == 1 && h->mb[i*16])
2594                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2595                                 else
2596                                     idct_add(ptr, h->mb + i*16, linesize);
2597                             }else
2598                                 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2599                         }
2600                     }
2601                 }
2602             }else{
                     /* intra 16x16: predict the whole luma block, then dequantize
                      * and inverse transform the luma DC coefficients */
2603                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2604                 if(is_h264){
2605                     if(!transform_bypass)
2606                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2607                 }else
2608                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2609             }
2610             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
2611                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2612         }else if(is_h264){
                 /* inter macroblock: motion compensation */
2613             hl_motion(h, dest_y, dest_cb, dest_cr,
2614                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2615                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2616                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2617         }
2618
2619
             /* luma residual for everything except intra NxN, which was
              * handled block by block above */
2620         if(!IS_INTRA4x4(mb_type)){
2621             if(is_h264){
2622                 if(IS_INTRA16x16(mb_type)){
2623                     for(i=0; i<16; i++){
2624                         if(h->non_zero_count_cache[ scan8[i] ])
2625                             idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2626                         else if(h->mb[i*16])
2627                             idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2628                     }
2629                 }else{
                         /* step over four 4x4 indices at a time for 8x8 transform blocks */
2630                     const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2631                     for(i=0; i<16; i+=di){
2632                         int nnz = h->non_zero_count_cache[ scan8[i] ];
2633                         if(nnz){
2634                             if(nnz==1 && h->mb[i*16])
2635                                 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2636                             else
2637                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2638                         }
2639                     }
2640                 }
2641             }else{
2642                 for(i=0; i<16; i++){
2643                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2644                         uint8_t * const ptr= dest_y + block_offset[i];
2645                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2646                     }
2647                 }
2648             }
2649         }
2650
             /* chroma residual: blocks 16..19 go to cb, 20..23 to cr, selected by (i&4)>>2 */
2651         if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2652             uint8_t *dest[2] = {dest_cb, dest_cr};
2653             if(transform_bypass){
2654                 idct_add = idct_dc_add = s->dsp.add_pixels4;
2655             }else{
2656                 idct_add = s->dsp.h264_idct_add;
2657                 idct_dc_add = s->dsp.h264_idct_dc_add;
                     /* dequant table index: 1/2 (cb/cr) for intra, 4/5 otherwise */
2658                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2659                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2660             }
2661             if(is_h264){
2662                 for(i=16; i<16+8; i++){
2663                     if(h->non_zero_count_cache[ scan8[i] ])
2664                         idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2665                     else if(h->mb[i*16])
2666                         idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2667                 }
2668             }else{
2669                 for(i=16; i<16+8; i++){
2670                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2671                         uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2672                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2673                     }
2674                 }
2675             }
2676         }
2677     }
2678     if(h->deblocking_filter) {
2679         if (!simple && FRAME_MBAFF) {
2680             //FIXME try deblocking one mb at a time?
2681             // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
                 /* From here on mb_y / mb_xy shadow the outer variables and refer
                  * to the top macroblock of the pair.
                  * NOTE(review): the two mb_type loads below run before the
                  * !bottom early return, i.e. also for a top macroblock, where
                  * this mb_y is s->mb_y-1 (-1 in the first row) -- confirm that
                  * mb_type has padding in front of row 0. */
2682             const int mb_y = s->mb_y - 1;
2683             uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
2684             const int mb_xy= mb_x + mb_y*s->mb_stride;
2685             const int mb_type_top   = s->current_picture.mb_type[mb_xy];
2686             const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
                 /* the pair is filtered as a whole once its bottom macroblock is done */
2687             if (!bottom) return;
2688             pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2689             pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2690             pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2691
2692             if(IS_INTRA(mb_type_top | mb_type_bottom))
2693                 xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
2694
2695             backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
2696             // deblock a pair
2697             // top
                 /* temporarily move the context to the top macroblock; undone below */
2698             s->mb_y--; h->mb_xy -= s->mb_stride;
2699             tprintf(h->s.avctx, "call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
2700             fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
2701             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2702             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2703             filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
2704             // bottom
2705             s->mb_y++; h->mb_xy += s->mb_stride;
2706             tprintf(h->s.avctx, "call mbaff filter_mb\n");
2707             fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
2708             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2709             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2710             filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2711         } else {
                 /* save the unfiltered borders first, then filter this macroblock */
2712             tprintf(h->s.avctx, "call filter_mb\n");
2713             backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2714             fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2715             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2716         }
2717     }
2718 }
2719
2720 /**
2721  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2722  */
2723 static void hl_decode_mb_simple(H264Context *h){
2724     hl_decode_mb_internal(h, 1);
2725 }
2726
2727 /**
2728  * Process a macroblock; this handles edge cases, such as interlacing.
2729  */
2730 static void av_noinline hl_decode_mb_complex(H264Context *h){
2731     hl_decode_mb_internal(h, 0);
2732 }
2733
2734 static void hl_decode_mb(H264Context *h){
2735     MpegEncContext * const s = &h->s;
2736     const int mb_xy= h->mb_xy;
2737     const int mb_type= s->current_picture.mb_type[mb_xy];
2738     int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 ||
2739                     (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || (ENABLE_H264_ENCODER && s->encoding) || ENABLE_SMALL;
2740
2741     if(ENABLE_H264_ENCODER && !s->decode)
2742         return;
2743
2744     if (is_complex)
2745         hl_decode_mb_complex(h);
2746     else hl_decode_mb_simple(h);
2747 }
2748
2749 static void pic_as_field(Picture *pic, const int parity){
2750     int i;
2751     for (i = 0; i < 4; ++i) {
2752         if (parity == PICT_BOTTOM_FIELD)
2753             pic->data[i] += pic->linesize[i];
2754         pic->reference = parity;
2755         pic->linesize[i] *= 2;
2756     }
2757 }
2758
2759 static int split_field_copy(Picture *dest, Picture *src,
2760                             int parity, int id_add){
2761     int match = !!(src->reference & parity);
2762
2763     if (match) {
2764         *dest = *src;
2765         pic_as_field(dest, parity);
2766         dest->pic_id *= 2;
2767         dest->pic_id += id_add;
2768     }
2769
2770     return match;
2771 }
2772
2773 /**
2774  * Split one reference list into field parts, interleaving by parity
2775  * as per H.264 spec section 8.2.4.2.5. Output fields have their data pointers
2776  * set to look at the actual start of data for that field.
2777  *
2778  * @param dest output list
2779  * @param dest_len maximum number of fields to put in dest
2780  * @param src the source reference list containing fields and/or field pairs
2781  *            (aka short_ref/long_ref, or
2782  *             refFrameListXShortTerm/refFrameListLongTerm in spec-speak)
2783  * @param src_len number of Picture's in source (pairs and unmatched fields)
2784  * @param parity the parity of the picture being decoded/needing
2785  *        these ref pics (PICT_{TOP,BOTTOM}_FIELD)
2786  * @return number of fields placed in dest
2787  */
2788 static int split_field_half_ref_list(Picture *dest, int dest_len,
2789                                      Picture *src,  int src_len,  int parity){
2790     int same_parity   = 1;
2791     int same_i        = 0;
2792     int opp_i         = 0;
2793     int out_i;
2794     int field_output;
2795
2796     for (out_i = 0; out_i < dest_len; out_i += field_output) {
2797         if (same_parity && same_i < src_len) {
2798             field_output = split_field_copy(dest + out_i, src + same_i,
2799                                             parity, 1);
2800             same_parity = !field_output;
2801             same_i++;
2802
2803         } else if (opp_i < src_len) {
2804             field_output = split_field_copy(dest + out_i, src + opp_i,
2805                                             PICT_FRAME - parity, 0);
2806             same_parity = field_output;
2807             opp_i++;
2808
2809         } else {
2810             break;
2811         }
2812     }
2813
2814     return out_i;
2815 }
2816
2817 /**
2818  * Split the reference frame list into a reference field list.
2819  * This implements H.264 spec 8.2.4.2.5 for a combined input list.
2820  * The input list contains both reference field pairs and
2821  * unmatched reference fields; it is ordered as spec describes
2822  * RefPicListX for frames in 8.2.4.2.1 and 8.2.4.2.3, except that
2823  * unmatched field pairs are also present. Conceptually this is equivalent
2824  * to concatenation of refFrameListXShortTerm with refFrameListLongTerm.
2825  *
2826  * @param dest output reference list where ordered fields are to be placed
2827  * @param dest_len max number of fields to place at dest
2828  * @param src source reference list, as described above
2829  * @param src_len number of pictures (pairs and unmatched fields) in src
2830  * @param parity parity of field being currently decoded
2831  *        (one of PICT_{TOP,BOTTOM}_FIELD)
2832  * @param long_i index into src array that holds first long reference picture,
2833  *        or src_len if no long refs present.
2834  */
2835 static int split_field_ref_list(Picture *dest, int dest_len,
2836                                 Picture *src,  int src_len,
2837                                 int parity,    int long_i){
2838
2839     int i = split_field_half_ref_list(dest, dest_len, src, long_i, parity);
2840     dest += i;
2841     dest_len -= i;
2842
2843     i += split_field_half_ref_list(dest, dest_len, src + long_i,
2844                                    src_len - long_i, parity);
2845     return i;
2846 }
2847
2848 /**
2849  * fills the default_ref_list.
2850  */
2851 static int fill_default_ref_list(H264Context *h){
2852     MpegEncContext * const s = &h->s;
2853     int i;
2854     int smallest_poc_greater_than_current = -1;
2855     int structure_sel;
2856     Picture sorted_short_ref[32];
2857     Picture field_entry_list[2][32];
2858     Picture *frame_list[2];
2859
2860     if (FIELD_PICTURE) {
2861         structure_sel = PICT_FRAME;
2862         frame_list[0] = field_entry_list[0];
2863         frame_list[1] = field_entry_list[1];
2864     } else {
2865         structure_sel = 0;
2866         frame_list[0] = h->default_ref_list[0];
2867         frame_list[1] = h->default_ref_list[1];
2868     }
2869
2870     if(h->slice_type==FF_B_TYPE){
2871         int list;
2872         int len[2];
2873         int short_len[2];
2874         int out_i;
2875         int limit= INT_MIN;
2876
2877         /* sort frame according to poc in B slice */
2878         for(out_i=0; out_i<h->short_ref_count; out_i++){
2879             int best_i=INT_MIN;
2880             int best_poc=INT_MAX;
2881
2882             for(i=0; i<h->short_ref_count; i++){
2883                 const int poc= h->short_ref[i]->poc;
2884                 if(poc > limit && poc < best_poc){
2885                     best_poc= poc;
2886                     best_i= i;
2887                 }
2888             }
2889
2890             assert(best_i != INT_MIN);
2891
2892             limit= best_poc;
2893             sorted_short_ref[out_i]= *h->short_ref[best_i];
2894             tprintf(h->s.avctx, "sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
2895             if (-1 == smallest_poc_greater_than_current) {
2896                 if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
2897                     smallest_poc_greater_than_current = out_i;
2898                 }
2899             }
2900         }
2901
2902         tprintf(h->s.avctx, "current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
2903
2904         // find the largest poc
2905         for(list=0; list<2; list++){
2906             int index = 0;
2907             int j= -99;
2908             int step= list ? -1 : 1;
2909
2910             for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
2911                 int sel;
2912                 while(j<0 || j>= h->short_ref_count){
2913                     if(j != -99 && step == (list ? -1 : 1))
2914                         return -1;
2915                     step = -step;
2916                     j= smallest_poc_greater_than_current + (step>>1);
2917                 }
2918                 sel = sorted_short_ref[j].reference | structure_sel;
2919                 if(sel != PICT_FRAME) continue;
2920                 frame_list[list][index  ]= sorted_short_ref[j];
2921                 frame_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
2922             }
2923             short_len[list] = index;
2924
2925             for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
2926                 int sel;
2927                 if(h->long_ref[i] == NULL) continue;
2928                 sel = h->long_ref[i]->reference | structure_sel;
2929                 if(sel != PICT_FRAME) continue;
2930
2931                 frame_list[ list ][index  ]= *h->long_ref[i];
2932                 frame_list[ list ][index++].pic_id= i;
2933             }
2934             len[list] = index;
2935         }
2936
2937         for(list=0; list<2; list++){
2938             if (FIELD_PICTURE)
2939                 len[list] = split_field_ref_list(h->default_ref_list[list],
2940                                                  h->ref_count[list],
2941                                                  frame_list[list],
2942                                                  len[list],
2943                                                  s->picture_structure,
2944                                                  short_len[list]);
2945
2946             // swap the two first elements of L1 when L0 and L1 are identical
2947             if(list && len[0] > 1 && len[0] == len[1])
2948                 for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0]; i++)
2949                     if(i == len[0]){
2950                         FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2951                         break;
2952                     }
2953
2954             if(len[list] < h->ref_count[ list ])
2955                 memset(&h->default_ref_list[list][len[list]], 0, sizeof(Picture)*(h->ref_count[ list ] - len[list]));
2956         }
2957
2958
2959     }else{
2960         int index=0;
2961         int short_len;
2962         for(i=0; i<h->short_ref_count; i++){
2963             int sel;
2964             sel = h->short_ref[i]->reference | structure_sel;
2965             if(sel != PICT_FRAME) continue;
2966             frame_list[0][index  ]= *h->short_ref[i];
2967             frame_list[0][index++].pic_id= h->short_ref[i]->frame_num;
2968         }
2969         short_len = index;
2970         for(i = 0; i < 16; i++){
2971             int sel;
2972             if(h->long_ref[i] == NULL) continue;
2973             sel = h->long_ref[i]->reference | structure_sel;
2974             if(sel != PICT_FRAME) continue;
2975             frame_list[0][index  ]= *h->long_ref[i];
2976             frame_list[0][index++].pic_id= i;
2977         }
2978
2979         if (FIELD_PICTURE)
2980             index = split_field_ref_list(h->default_ref_list[0],
2981                                          h->ref_count[0], frame_list[0],
2982                                          index, s->picture_structure,
2983                                          short_len);
2984
2985         if(index < h->ref_count[0])
2986             memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
2987     }
2988 #ifdef TRACE
2989     for (i=0; i<h->ref_count[0]; i++) {
2990         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2991     }
2992     if(h->slice_type==FF_B_TYPE){
2993         for (i=0; i<h->ref_count[1]; i++) {
2994             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2995         }
2996     }
2997 #endif
2998     return 0;
2999 }
3000
3001 static void print_short_term(H264Context *h);
3002 static void print_long_term(H264Context *h);
3003
3004 /**
3005  * Extract structure information about the picture described by pic_num in
3006  * the current decoding context (frame or field). Note that pic_num is
3007  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
3008  * @param pic_num picture number for which to extract structure information
3009  * @param structure one of PICT_XXX describing structure of picture
3010  *                      with pic_num
3011  * @return frame number (short term) or long term index of picture
3012  *         described by pic_num
3013  */
3014 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
3015     MpegEncContext * const s = &h->s;
3016
3017     *structure = s->picture_structure;
3018     if(FIELD_PICTURE){
3019         if (!(pic_num & 1))
3020             /* opposite field */
3021             *structure ^= PICT_FRAME;
3022         pic_num >>= 1;
3023     }
3024
3025     return pic_num;
3026 }
3027
3028 static int decode_ref_pic_list_reordering(H264Context *h){
3029     MpegEncContext * const s = &h->s;
3030     int list, index, pic_structure;
3031
3032     print_short_term(h);
3033     print_long_term(h);
3034     if(h->slice_type_nos==FF_I_TYPE) return 0; //FIXME move before func
3035
3036     for(list=0; list<h->list_count; list++){
3037         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
3038
3039         if(get_bits1(&s->gb)){
3040             int pred= h->curr_pic_num;
3041
3042             for(index=0; ; index++){
3043                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
3044                 unsigned int pic_id;
3045                 int i;
3046                 Picture *ref = NULL;
3047
3048                 if(reordering_of_pic_nums_idc==3)
3049                     break;
3050
3051                 if(index >= h->ref_count[list]){
3052                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
3053                     return -1;
3054                 }
3055
3056                 if(reordering_of_pic_nums_idc<3){
3057                     if(reordering_of_pic_nums_idc<2){
3058                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
3059                         int frame_num;
3060
3061                         if(abs_diff_pic_num > h->max_pic_num){
3062                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
3063                             return -1;
3064                         }
3065
3066                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
3067                         else                                pred+= abs_diff_pic_num;
3068                         pred &= h->max_pic_num - 1;
3069
3070                         frame_num = pic_num_extract(h, pred, &pic_structure);
3071
3072                         for(i= h->short_ref_count-1; i>=0; i--){
3073                             ref = h->short_ref[i];
3074                             assert(ref->reference);
3075                             assert(!ref->long_ref);
3076                             if(ref->data[0] != NULL &&
3077                                    ref->frame_num == frame_num &&
3078                                    (ref->reference & pic_structure) &&
3079                                    ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer
3080                                 break;
3081                         }
3082                         if(i>=0)
3083                             ref->pic_id= pred;
3084                     }else{
3085                         int long_idx;
3086                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
3087
3088                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
3089
3090                         if(long_idx>31){
3091                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
3092                             return -1;
3093                         }
3094                         ref = h->long_ref[long_idx];
3095                         assert(!(ref && !ref->reference));
3096                         if(ref && (ref->reference & pic_structure)){
3097                             ref->pic_id= pic_id;
3098                             assert(ref->long_ref);
3099                             i=0;
3100                         }else{
3101                             i=-1;
3102                         }
3103                     }
3104
3105                     if (i < 0) {
3106                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
3107                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
3108                     } else {
3109                         for(i=index; i+1<h->ref_count[list]; i++){
3110                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
3111                                 break;
3112                         }
3113                         for(; i > index; i--){
3114                             h->ref_list[list][i]= h->ref_list[list][i-1];
3115                         }
3116                         h->ref_list[list][index]= *ref;
3117                         if (FIELD_PICTURE){
3118                             pic_as_field(&h->ref_list[list][index], pic_structure);
3119                         }
3120                     }
3121                 }else{
3122                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
3123                     return -1;
3124                 }
3125             }
3126         }
3127     }
3128     for(list=0; list<h->list_count; list++){
3129         for(index= 0; index < h->ref_count[list]; index++){
3130             if(!h->ref_list[list][index].data[0])
3131                 h->ref_list[list][index]= s->current_picture;
3132         }
3133     }
3134
3135     if(h->slice_type==FF_B_TYPE && !h->direct_spatial_mv_pred)
3136         direct_dist_scale_factor(h);
3137     direct_ref_list_init(h);
3138     return 0;
3139 }
3140
3141 static void fill_mbaff_ref_list(H264Context *h){
3142     int list, i, j;
3143     for(list=0; list<2; list++){ //FIXME try list_count
3144         for(i=0; i<h->ref_count[list]; i++){
3145             Picture *frame = &h->ref_list[list][i];
3146             Picture *field = &h->ref_list[list][16+2*i];
3147             field[0] = *frame;
3148             for(j=0; j<3; j++)
3149                 field[0].linesize[j] <<= 1;
3150             field[0].reference = PICT_TOP_FIELD;
3151             field[1] = field[0];
3152             for(j=0; j<3; j++)
3153                 field[1].data[j] += frame->linesize[j];
3154             field[1].reference = PICT_BOTTOM_FIELD;
3155
3156             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
3157             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
3158             for(j=0; j<2; j++){
3159                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
3160                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
3161             }
3162         }
3163     }
3164     for(j=0; j<h->ref_count[1]; j++){
3165         for(i=0; i<h->ref_count[0]; i++)
3166             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
3167         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
3168         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
3169     }
3170 }
3171
3172 static int pred_weight_table(H264Context *h){
3173     MpegEncContext * const s = &h->s;
3174     int list, i;
3175     int luma_def, chroma_def;
3176
3177     h->use_weight= 0;
3178     h->use_weight_chroma= 0;
3179     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3180     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3181     luma_def = 1<<h->luma_log2_weight_denom;
3182     chroma_def = 1<<h->chroma_log2_weight_denom;
3183
3184     for(list=0; list<2; list++){
3185         for(i=0; i<h->ref_count[list]; i++){
3186             int luma_weight_flag, chroma_weight_flag;
3187
3188             luma_weight_flag= get_bits1(&s->gb);
3189             if(luma_weight_flag){
3190                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3191                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3192                 if(   h->luma_weight[list][i] != luma_def
3193                    || h->luma_offset[list][i] != 0)
3194                     h->use_weight= 1;
3195             }else{
3196                 h->luma_weight[list][i]= luma_def;
3197                 h->luma_offset[list][i]= 0;
3198             }
3199
3200             chroma_weight_flag= get_bits1(&s->gb);
3201             if(chroma_weight_flag){
3202                 int j;
3203                 for(j=0; j<2; j++){
3204                     h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3205                     h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3206                     if(   h->chroma_weight[list][i][j] != chroma_def
3207                        || h->chroma_offset[list][i][j] != 0)
3208                         h->use_weight_chroma= 1;
3209                 }
3210             }else{
3211                 int j;
3212                 for(j=0; j<2; j++){
3213                     h->chroma_weight[list][i][j]= chroma_def;
3214                     h->chroma_offset[list][i][j]= 0;
3215                 }
3216             }
3217         }
3218         if(h->slice_type != FF_B_TYPE) break;
3219     }
3220     h->use_weight= h->use_weight || h->use_weight_chroma;
3221     return 0;
3222 }
3223
3224 static void implicit_weight_table(H264Context *h){
3225     MpegEncContext * const s = &h->s;
3226     int ref0, ref1;
3227     int cur_poc = s->current_picture_ptr->poc;
3228
3229     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3230        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3231         h->use_weight= 0;
3232         h->use_weight_chroma= 0;
3233         return;
3234     }
3235
3236     h->use_weight= 2;
3237     h->use_weight_chroma= 2;
3238     h->luma_log2_weight_denom= 5;
3239     h->chroma_log2_weight_denom= 5;
3240
3241     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3242         int poc0 = h->ref_list[0][ref0].poc;
3243         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3244             int poc1 = h->ref_list[1][ref1].poc;
3245             int td = av_clip(poc1 - poc0, -128, 127);
3246             if(td){
3247                 int tb = av_clip(cur_poc - poc0, -128, 127);
3248                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3249                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3250                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3251                     h->implicit_weight[ref0][ref1] = 32;
3252                 else
3253                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3254             }else
3255                 h->implicit_weight[ref0][ref1] = 32;
3256         }
3257     }
3258 }
3259
3260 /**
3261  * Mark a picture as no longer needed for reference. The refmask
3262  * argument allows unreferencing of individual fields or the whole frame.
3263  * If the picture becomes entirely unreferenced, but is being held for
3264  * display purposes, it is marked as such.
3265  * @param refmask mask of fields to unreference; the mask is bitwise
3266  *                anded with the reference marking of pic
3267  * @return non-zero if pic becomes entirely unreferenced (except possibly
3268  *         for display purposes) zero if one of the fields remains in
3269  *         reference
3270  */
3271 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3272     int i;
3273     if (pic->reference &= refmask) {
3274         return 0;
3275     } else {
3276         for(i = 0; h->delayed_pic[i]; i++)
3277             if(pic == h->delayed_pic[i]){
3278                 pic->reference=DELAYED_PIC_REF;
3279                 break;
3280             }
3281         return 1;
3282     }
3283 }
3284
3285 /**
3286  * instantaneous decoder refresh.
3287  */
3288 static void idr(H264Context *h){
3289     int i;
3290
3291     for(i=0; i<16; i++){
3292         if (h->long_ref[i] != NULL) {
3293             unreference_pic(h, h->long_ref[i], 0);
3294             h->long_ref[i]= NULL;
3295         }
3296     }
3297     h->long_ref_count=0;
3298
3299     for(i=0; i<h->short_ref_count; i++){
3300         unreference_pic(h, h->short_ref[i], 0);
3301         h->short_ref[i]= NULL;
3302     }
3303     h->short_ref_count=0;
3304 }
3305
3306 /* forget old pics after a seek */
3307 static void flush_dpb(AVCodecContext *avctx){
3308     H264Context *h= avctx->priv_data;
3309     int i;
3310     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3311         if(h->delayed_pic[i])
3312             h->delayed_pic[i]->reference= 0;
3313         h->delayed_pic[i]= NULL;
3314     }
3315     h->outputed_poc= INT_MIN;
3316     idr(h);
3317     if(h->s.current_picture_ptr)
3318         h->s.current_picture_ptr->reference= 0;
3319     h->s.first_field= 0;
3320     ff_mpeg_flush(avctx);
3321 }
3322
3323 /**
3324  * Find a Picture in the short term reference list by frame number.
3325  * @param frame_num frame number to search for
3326  * @param idx the index into h->short_ref where returned picture is found
3327  *            undefined if no picture found.
3328  * @return pointer to the found picture, or NULL if no pic with the provided
3329  *                 frame number is found
3330  */
3331 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3332     MpegEncContext * const s = &h->s;
3333     int i;
3334
3335     for(i=0; i<h->short_ref_count; i++){
3336         Picture *pic= h->short_ref[i];
3337         if(s->avctx->debug&FF_DEBUG_MMCO)
3338             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3339         if(pic->frame_num == frame_num) {
3340             *idx = i;
3341             return pic;
3342         }
3343     }
3344     return NULL;
3345 }
3346
3347 /**
3348  * Remove a picture from the short term reference list by its index in
3349  * that list.  This does no checking on the provided index; it is assumed
3350  * to be valid. Other list entries are shifted down.
3351  * @param i index into h->short_ref of picture to remove.
3352  */
3353 static void remove_short_at_index(H264Context *h, int i){
3354     assert(i >= 0 && i < h->short_ref_count);
3355     h->short_ref[i]= NULL;
3356     if (--h->short_ref_count)
3357         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3358 }
3359
3360 /**
3361  *
3362  * @return the removed picture or NULL if an error occurs
3363  */
3364 static Picture * remove_short(H264Context *h, int frame_num){
3365     MpegEncContext * const s = &h->s;
3366     Picture *pic;
3367     int i;
3368
3369     if(s->avctx->debug&FF_DEBUG_MMCO)
3370         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3371
3372     pic = find_short(h, frame_num, &i);
3373     if (pic)
3374         remove_short_at_index(h, i);
3375
3376     return pic;
3377 }
3378
3379 /**
3380  * Remove a picture from the long term reference list by its index in
3381  * that list.  This does no checking on the provided index; it is assumed
3382  * to be valid. The removed entry is set to NULL. Other entries are unaffected.
3383  * @param i index into h->long_ref of picture to remove.
3384  */
3385 static void remove_long_at_index(H264Context *h, int i){
3386     h->long_ref[i]= NULL;
3387     h->long_ref_count--;
3388 }
3389
3390 /**
3391  *
3392  * @return the removed picture or NULL if an error occurs
3393  */
3394 static Picture * remove_long(H264Context *h, int i){
3395     Picture *pic;
3396
3397     pic= h->long_ref[i];
3398     if (pic)
3399         remove_long_at_index(h, i);
3400
3401     return pic;
3402 }
3403
3404 /**
3405  * print short term list
3406  */
3407 static void print_short_term(H264Context *h) {
3408     uint32_t i;
3409     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3410         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3411         for(i=0; i<h->short_ref_count; i++){
3412             Picture *pic= h->short_ref[i];
3413             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3414         }
3415     }
3416 }
3417
3418 /**
3419  * print long term list
3420  */
3421 static void print_long_term(H264Context *h) {
3422     uint32_t i;
3423     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3424         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3425         for(i = 0; i < 16; i++){
3426             Picture *pic= h->long_ref[i];
3427             if (pic) {
3428                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3429             }
3430         }
3431     }
3432 }
3433
3434 /**
3435  * Executes the reference picture marking (memory management control operations).
3436  */
3437 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3438     MpegEncContext * const s = &h->s;
3439     int i, j;
3440     int current_ref_assigned=0;
3441     Picture *pic;
3442
3443     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3444         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3445
3446     for(i=0; i<mmco_count; i++){
3447         int structure, frame_num, unref_pic;
3448         if(s->avctx->debug&FF_DEBUG_MMCO)
3449             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3450
3451         switch(mmco[i].opcode){
3452         case MMCO_SHORT2UNUSED:
3453             if(s->avctx->debug&FF_DEBUG_MMCO)
3454                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3455             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3456             pic = find_short(h, frame_num, &j);
3457             if (pic) {
3458                 if (unreference_pic(h, pic, structure ^ PICT_FRAME))
3459                     remove_short_at_index(h, j);
3460             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3461                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short failure\n");
3462             break;
3463         case MMCO_SHORT2LONG:
3464             if (FIELD_PICTURE && mmco[i].long_arg < h->long_ref_count &&
3465                     h->long_ref[mmco[i].long_arg]->frame_num ==
3466                                               mmco[i].short_pic_num / 2) {
3467                 /* do nothing, we've already moved this field pair. */
3468             } else {
3469                 int frame_num = mmco[i].short_pic_num >> FIELD_PICTURE;
3470
3471                 pic= remove_long(h, mmco[i].long_arg);
3472                 if(pic) unreference_pic(h, pic, 0);
3473
3474                 h->long_ref[ mmco[i].long_arg ]= remove_short(h, frame_num);
3475                 if (h->long_ref[ mmco[i].long_arg ]){
3476                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3477                     h->long_ref_count++;
3478                 }
3479             }
3480             break;
3481         case MMCO_LONG2UNUSED:
3482             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3483             pic = h->long_ref[j];
3484             if (pic) {
3485                 if (unreference_pic(h, pic, structure ^ PICT_FRAME))
3486                     remove_long_at_index(h, j);
3487             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3488                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3489             break;
3490         case MMCO_LONG:
3491             unref_pic = 1;
3492             if (FIELD_PICTURE && !s->first_field) {
3493                 if (h->long_ref[mmco[i].long_arg] == s->current_picture_ptr) {
3494                     /* Just mark second field as referenced */
3495                     unref_pic = 0;
3496                 } else if (s->current_picture_ptr->reference) {
3497                     /* First field in pair is in short term list or
3498                      * at a different long term index.
3499                      * This is not allowed; see 7.4.3, notes 2 and 3.
3500                      * Report the problem and keep the pair where it is,
3501                      * and mark this field valid.
3502                      */
3503                     av_log(h->s.avctx, AV_LOG_ERROR,
3504                         "illegal long term reference assignment for second "
3505                         "field in complementary field pair (first field is "
3506                         "short term or has non-matching long index)\n");
3507                     unref_pic = 0;
3508                 }
3509             }
3510
3511             if (unref_pic) {
3512                 pic= remove_long(h, mmco[i].long_arg);
3513                 if(pic) unreference_pic(h, pic, 0);
3514
3515                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3516                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3517                 h->long_ref_count++;
3518             }
3519
3520             s->current_picture_ptr->reference |= s->picture_structure;
3521             current_ref_assigned=1;
3522             break;
3523         case MMCO_SET_MAX_LONG:
3524             assert(mmco[i].long_arg <= 16);
3525             // just remove the long term which index is greater than new max
3526             for(j = mmco[i].long_arg; j<16; j++){
3527                 pic = remove_long(h, j);
3528                 if (pic) unreference_pic(h, pic, 0);
3529             }
3530             break;
3531         case MMCO_RESET:
3532             while(h->short_ref_count){
3533                 pic= remove_short(h, h->short_ref[0]->frame_num);
3534                 if(pic) unreference_pic(h, pic, 0);
3535             }
3536             for(j = 0; j < 16; j++) {
3537                 pic= remove_long(h, j);
3538                 if(pic) unreference_pic(h, pic, 0);
3539             }
3540             break;
3541         default: assert(0);
3542         }
3543     }
3544
3545     if (!current_ref_assigned && FIELD_PICTURE &&
3546             !s->first_field && s->current_picture_ptr->reference) {
3547
3548         /* Second field of complementary field pair; the first field of
3549          * which is already referenced. If short referenced, it
3550          * should be first entry in short_ref. If not, it must exist
3551          * in long_ref; trying to put it on the short list here is an
3552          * error in the encoded bit stream (ref: 7.4.3, NOTE 2 and 3).
3553          */
3554         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3555             /* Just mark the second field valid */
3556             s->current_picture_ptr->reference = PICT_FRAME;
3557         } else if (s->current_picture_ptr->long_ref) {
3558             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3559                                              "assignment for second field "
3560                                              "in complementary field pair "
3561                                              "(first field is long term)\n");
3562         } else {
3563             /*
3564              * First field in reference, but not in any sensible place on our
3565              * reference lists. This shouldn't happen unless reference
3566              * handling somewhere else is wrong.
3567              */
3568             assert(0);
3569         }
3570         current_ref_assigned = 1;
3571     }
3572
3573     if(!current_ref_assigned){
3574         pic= remove_short(h, s->current_picture_ptr->frame_num);
3575         if(pic){
3576             unreference_pic(h, pic, 0);
3577             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3578         }
3579
3580         if(h->short_ref_count)
3581             memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3582
3583         h->short_ref[0]= s->current_picture_ptr;
3584         h->short_ref[0]->long_ref=0;
3585         h->short_ref_count++;
3586         s->current_picture_ptr->reference |= s->picture_structure;
3587     }
3588
3589     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3590
3591         /* We have too many reference frames, probably due to corrupted
3592          * stream. Need to discard one frame. Prevents overrun of the
3593          * short_ref and long_ref buffers.
3594          */
3595         av_log(h->s.avctx, AV_LOG_ERROR,
3596                "number of reference frames exceeds max (probably "
3597                "corrupt input), discarding one\n");
3598
3599         if (h->long_ref_count) {
3600             for (i = 0; i < 16; ++i)
3601                 if (h->long_ref[i])
3602                     break;
3603
3604             assert(i < 16);
3605             pic = h->long_ref[i];
3606             remove_long_at_index(h, i);
3607         } else {
3608             pic = h->short_ref[h->short_ref_count - 1];
3609             remove_short_at_index(h, h->short_ref_count - 1);
3610         }
3611         unreference_pic(h, pic, 0);
3612     }
3613
3614     print_short_term(h);
3615     print_long_term(h);
3616     return 0;
3617 }
3618
3619 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3620     MpegEncContext * const s = &h->s;
3621     int i;
3622
3623     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3624         s->broken_link= get_bits1(gb) -1;
3625         h->mmco[0].long_arg= get_bits1(gb) - 1; // current_long_term_idx
3626         if(h->mmco[0].long_arg == -1)
3627             h->mmco_index= 0;
3628         else{
3629             h->mmco[0].opcode= MMCO_LONG;
3630             h->mmco_index= 1;
3631         }
3632     }else{
3633         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3634             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3635                 MMCOOpcode opcode= get_ue_golomb(gb);
3636
3637                 h->mmco[i].opcode= opcode;
3638                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3639                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3640 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3641                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3642                         return -1;
3643                     }*/
3644                 }
3645                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3646                     unsigned int long_arg= get_ue_golomb(gb);
3647                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3648                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3649                         return -1;
3650                     }
3651                     h->mmco[i].long_arg= long_arg;
3652                 }
3653
3654                 if(opcode > (unsigned)MMCO_LONG){
3655                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3656                     return -1;
3657                 }
3658                 if(opcode == MMCO_END)
3659                     break;
3660             }
3661             h->mmco_index= i;
3662         }else{
3663             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3664
3665             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3666                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3667                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3668                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3669                 h->mmco_index= 1;
3670                 if (FIELD_PICTURE) {
3671                     h->mmco[0].short_pic_num *= 2;
3672                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3673                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3674                     h->mmco_index= 2;
3675                 }
3676             }else
3677                 h->mmco_index= 0;
3678         }
3679     }
3680
3681     return 0;
3682 }
3683
3684 static int init_poc(H264Context *h){
3685     MpegEncContext * const s = &h->s;
3686     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3687     int field_poc[2];
3688
3689     if(h->nal_unit_type == NAL_IDR_SLICE){
3690         h->frame_num_offset= 0;
3691     }else{
3692         if(h->frame_num < h->prev_frame_num)
3693             h->frame_num_offset= h->prev_frame_num_offset + max_frame_num;
3694         else
3695             h->frame_num_offset= h->prev_frame_num_offset;
3696     }
3697
3698     if(h->sps.poc_type==0){
3699         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3700
3701         if(h->nal_unit_type == NAL_IDR_SLICE){
3702              h->prev_poc_msb=
3703              h->prev_poc_lsb= 0;
3704         }
3705
3706         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3707             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3708         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3709             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3710         else
3711             h->poc_msb = h->prev_poc_msb;
3712 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3713         field_poc[0] =
3714         field_poc[1] = h->poc_msb + h->poc_lsb;
3715         if(s->picture_structure == PICT_FRAME)
3716             field_poc[1] += h->delta_poc_bottom;
3717     }else if(h->sps.poc_type==1){
3718         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3719         int i;
3720
3721         if(h->sps.poc_cycle_length != 0)
3722             abs_frame_num = h->frame_num_offset + h->frame_num;
3723         else
3724             abs_frame_num = 0;
3725
3726         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3727             abs_frame_num--;
3728
3729         expected_delta_per_poc_cycle = 0;
3730         for(i=0; i < h->sps.poc_cycle_length; i++)
3731             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3732
3733         if(abs_frame_num > 0){
3734             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3735             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3736
3737             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3738             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3739                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3740         } else
3741             expectedpoc = 0;
3742
3743         if(h->nal_ref_idc == 0)
3744             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3745
3746         field_poc[0] = expectedpoc + h->delta_poc[0];
3747         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3748
3749         if(s->picture_structure == PICT_FRAME)
3750             field_poc[1] += h->delta_poc[1];
3751     }else{
3752         int poc;
3753         if(h->nal_unit_type == NAL_IDR_SLICE){
3754             poc= 0;
3755         }else{
3756             if(h->nal_ref_idc) poc= 2*(h->frame_num_offset + h->frame_num);
3757             else               poc= 2*(h->frame_num_offset + h->frame_num) - 1;
3758         }
3759         field_poc[0]= poc;
3760         field_poc[1]= poc;
3761     }
3762
3763     if(s->picture_structure != PICT_BOTTOM_FIELD) {
3764         s->current_picture_ptr->field_poc[0]= field_poc[0];
3765         s->current_picture_ptr->poc = field_poc[0];
3766     }
3767     if(s->picture_structure != PICT_TOP_FIELD) {
3768         s->current_picture_ptr->field_poc[1]= field_poc[1];
3769         s->current_picture_ptr->poc = field_poc[1];
3770     }
3771     if(!FIELD_PICTURE || !s->first_field) {
3772         Picture *cur = s->current_picture_ptr;
3773         cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3774     }
3775
3776     return 0;
3777 }
3778
3779
3780 /**
3781  * initialize scan tables
3782  */
3783 static void init_scan_tables(H264Context *h){
3784     MpegEncContext * const s = &h->s;
3785     int i;
3786     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3787         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3788         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3789     }else{
3790         for(i=0; i<16; i++){
3791 #define T(x) (x>>2) | ((x<<2) & 0xF)
3792             h->zigzag_scan[i] = T(zigzag_scan[i]);
3793             h-> field_scan[i] = T( field_scan[i]);
3794 #undef T
3795         }
3796     }
3797     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3798         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
3799         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3800         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3801         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3802     }else{
3803         for(i=0; i<64; i++){
3804 #define T(x) (x>>3) | ((x&7)<<3)
3805             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
3806             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3807             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3808             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3809 #undef T
3810         }
3811     }
3812     if(h->sps.transform_bypass){ //FIXME same ugly
3813         h->zigzag_scan_q0          = zigzag_scan;
3814         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
3815         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3816         h->field_scan_q0           = field_scan;
3817         h->field_scan8x8_q0        = field_scan8x8;
3818         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3819     }else{
3820         h->zigzag_scan_q0          = h->zigzag_scan;
3821         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3822         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3823         h->field_scan_q0           = h->field_scan;
3824         h->field_scan8x8_q0        = h->field_scan8x8;
3825         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3826     }
3827 }
3828
3829 /**
3830  * Replicates H264 "master" context to thread contexts.
3831  */
3832 static void clone_slice(H264Context *dst, H264Context *src)
3833 {
3834     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3835     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3836     dst->s.current_picture      = src->s.current_picture;
3837     dst->s.linesize             = src->s.linesize;
3838     dst->s.uvlinesize           = src->s.uvlinesize;
3839     dst->s.first_field          = src->s.first_field;
3840
3841     dst->prev_poc_msb           = src->prev_poc_msb;
3842     dst->prev_poc_lsb           = src->prev_poc_lsb;
3843     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3844     dst->prev_frame_num         = src->prev_frame_num;
3845     dst->short_ref_count        = src->short_ref_count;
3846
3847     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3848     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3849     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3850     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3851
3852     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3853     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3854 }
3855
3856 /**
3857  * decodes a slice header.
3858  * This will also call MPV_common_init() and frame_start() as needed.
3859  *
3860  * @param h h264context
3861  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3862  *
3863  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3864  */
3865 static int decode_slice_header(H264Context *h, H264Context *h0){
3866     MpegEncContext * const s = &h->s;
3867     MpegEncContext * const s0 = &h0->s;
3868     unsigned int first_mb_in_slice;
3869     unsigned int pps_id;
3870     int num_ref_idx_active_override_flag;
3871     static const uint8_t slice_type_map[5]= {FF_P_TYPE, FF_B_TYPE, FF_I_TYPE, FF_SP_TYPE, FF_SI_TYPE};
3872     unsigned int slice_type, tmp, i, j;
3873     int default_ref_list_done = 0;
3874     int last_pic_structure;
3875
3876     s->dropable= h->nal_ref_idc == 0;
3877
3878     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3879         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3880         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3881     }else{
3882         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3883         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3884     }
3885
3886     first_mb_in_slice= get_ue_golomb(&s->gb);
3887
3888     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3889         h0->current_slice = 0;
3890         if (!s0->first_field)
3891             s->current_picture_ptr= NULL;
3892     }
3893
3894     slice_type= get_ue_golomb(&s->gb);
3895     if(slice_type > 9){
3896         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3897         return -1;
3898     }
3899     if(slice_type > 4){
3900         slice_type -= 5;
3901         h->slice_type_fixed=1;
3902     }else
3903         h->slice_type_fixed=0;
3904
3905     slice_type= slice_type_map[ slice_type ];
3906     if (slice_type == FF_I_TYPE
3907         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3908         default_ref_list_done = 1;
3909     }
3910     h->slice_type= slice_type;
3911     h->slice_type_nos= slice_type & 3;
3912
3913     s->pict_type= h->slice_type; // to make a few old func happy, it's wrong though
3914     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3915         av_log(h->s.avctx, AV_LOG_ERROR,
3916                "B picture before any references, skipping\n");
3917         return -1;
3918     }
3919
3920     pps_id= get_ue_golomb(&s->gb);
3921     if(pps_id>=MAX_PPS_COUNT){
3922         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3923         return -1;
3924     }
3925     if(!h0->pps_buffers[pps_id]) {
3926         av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n");
3927         return -1;
3928     }
3929     h->pps= *h0->pps_buffers[pps_id];
3930
3931     if(!h0->sps_buffers[h->pps.sps_id]) {
3932         av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n");
3933         return -1;
3934     }
3935     h->sps = *h0->sps_buffers[h->pps.sps_id];
3936
3937     if(h == h0 && h->dequant_coeff_pps != pps_id){
3938         h->dequant_coeff_pps = pps_id;
3939         init_dequant_tables(h);
3940     }
3941
3942     s->mb_width= h->sps.mb_width;
3943     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3944
3945     h->b_stride=  s->mb_width*4;
3946     h->b8_stride= s->mb_width*2;
3947
3948     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3949     if(h->sps.frame_mbs_only_flag)
3950         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3951     else
3952         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3953
3954     if (s->context_initialized
3955         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3956         if(h != h0)
3957             return -1;   // width / height changed during parallelized decoding
3958         free_tables(h);
3959         MPV_common_end(s);
3960     }
3961     if (!s->context_initialized) {
3962         if(h != h0)
3963             return -1;  // we cant (re-)initialize context during parallel decoding
3964         if (MPV_common_init(s) < 0)
3965             return -1;
3966         s->first_field = 0;
3967
3968         init_scan_tables(h);
3969         alloc_tables(h);
3970
3971         for(i = 1; i < s->avctx->thread_count; i++) {
3972             H264Context *c;
3973             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3974             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3975             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3976             c->sps = h->sps;
3977             c->pps = h->pps;
3978             init_scan_tables(c);
3979             clone_tables(c, h);
3980         }
3981
3982         for(i = 0; i < s->avctx->thread_count; i++)
3983             if(context_init(h->thread_context[i]) < 0)
3984                 return -1;
3985
3986         s->avctx->width = s->width;
3987         s->avctx->height = s->height;
3988         s->avctx->sample_aspect_ratio= h->sps.sar;
3989         if(!s->avctx->sample_aspect_ratio.den)
3990             s->avctx->sample_aspect_ratio.den = 1;
3991
3992         if(h->sps.timing_info_present_flag){
3993             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3994             if(h->x264_build > 0 && h->x264_build < 44)
3995                 s->avctx->time_base.den *= 2;
3996             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3997                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3998         }
3999     }
4000
4001     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
4002
4003     h->mb_mbaff = 0;
4004     h->mb_aff_frame = 0;
4005     last_pic_structure = s0->picture_structure;
4006     if(h->sps.frame_mbs_only_flag){
4007         s->picture_structure= PICT_FRAME;
4008     }else{
4009         if(get_bits1(&s->gb)) { //field_pic_flag
4010             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
4011         } else {
4012             s->picture_structure= PICT_FRAME;
4013             h->mb_aff_frame = h->sps.mb_aff;
4014         }
4015     }
4016
4017     if(h0->current_slice == 0){
4018         /* See if we have a decoded first field looking for a pair... */
4019         if (s0->first_field) {
4020             assert(s0->current_picture_ptr);
4021             assert(s0->current_picture_ptr->data[0]);
4022             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
4023
4024             /* figure out if we have a complementary field pair */
4025             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
4026                 /*
4027                  * Previous field is unmatched. Don't display it, but let it
4028                  * remain for reference if marked as such.
4029                  */
4030                 s0->current_picture_ptr = NULL;
4031                 s0->first_field = FIELD_PICTURE;
4032
4033             } else {
4034                 if (h->nal_ref_idc &&
4035                         s0->current_picture_ptr->reference &&
4036                         s0->current_picture_ptr->frame_num != h->frame_num) {
4037                     /*
4038                      * This and previous field were reference, but had
4039                      * different frame_nums. Consider this field first in
4040                      * pair. Throw away previous field except for reference
4041                      * purposes.
4042                      */
4043                     s0->first_field = 1;
4044                     s0->current_picture_ptr = NULL;
4045
4046                 } else {
4047                     /* Second field in complementary pair */
4048                     s0->first_field = 0;
4049                 }
4050             }
4051
4052         } else {
4053             /* Frame or first field in a potentially complementary pair */
4054             assert(!s0->current_picture_ptr);
4055             s0->first_field = FIELD_PICTURE;
4056         }
4057
4058         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
4059             s0->first_field = 0;
4060             return -1;
4061         }
4062     }
4063     if(h != h0)
4064         clone_slice(h, h0);
4065
4066     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
4067
4068     assert(s->mb_num == s->mb_width * s->mb_height);
4069     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
4070        first_mb_in_slice                    >= s->mb_num){
4071         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
4072         return -1;
4073     }
4074     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
4075     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
4076     if (s->picture_structure == PICT_BOTTOM_FIELD)
4077         s->resync_mb_y = s->mb_y = s->mb_y + 1;
4078     assert(s->mb_y < s->mb_height);
4079
4080     if(s->picture_structure==PICT_FRAME){
4081         h->curr_pic_num=   h->frame_num;
4082         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
4083     }else{
4084         h->curr_pic_num= 2*h->frame_num + 1;
4085         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
4086     }
4087
4088     if(h->nal_unit_type == NAL_IDR_SLICE){
4089         get_ue_golomb(&s->gb); /* idr_pic_id */
4090     }
4091
4092     if(h->sps.poc_type==0){
4093         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
4094
4095         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
4096             h->delta_poc_bottom= get_se_golomb(&s->gb);
4097         }
4098     }
4099
4100     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
4101         h->delta_poc[0]= get_se_golomb(&s->gb);
4102
4103         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
4104             h->delta_poc[1]= get_se_golomb(&s->gb);
4105     }
4106
4107     init_poc(h);
4108
4109     if(h->pps.redundant_pic_cnt_present){
4110         h->redundant_pic_count= get_ue_golomb(&s->gb);
4111     }
4112
4113     //set defaults, might be overriden a few line later
4114     h->ref_count[0]= h->pps.ref_count[0];
4115     h->ref_count[1]= h->pps.ref_count[1];
4116
4117     if(h->slice_type_nos != FF_I_TYPE){
4118         if(h->slice_type == FF_B_TYPE){
4119             h->direct_spatial_mv_pred= get_bits1(&s->gb);
4120             if(FIELD_PICTURE && h->direct_spatial_mv_pred)
4121                 av_log(h->s.avctx, AV_LOG_ERROR, "PAFF + spatial direct mode is not implemented\n");
4122         }
4123         num_ref_idx_active_override_flag= get_bits1(&s->gb);
4124
4125         if(num_ref_idx_active_override_flag){
4126             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
4127             if(h->slice_type==FF_B_TYPE)
4128                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
4129
4130             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
4131                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
4132                 h->ref_count[0]= h->ref_count[1]= 1;
4133                 return -1;
4134             }
4135         }
4136         if(h->slice_type == FF_B_TYPE)
4137             h->list_count= 2;
4138         else
4139             h->list_count= 1;
4140     }else
4141         h->list_count= 0;
4142
4143     if(!default_ref_list_done){
4144         fill_default_ref_list(h);
4145     }
4146
4147     if(decode_ref_pic_list_reordering(h) < 0)
4148         return -1;
4149
4150     if(   (h->pps.weighted_pred          && (h->slice_type_nos == FF_P_TYPE ))
4151        || (h->pps.weighted_bipred_idc==1 && h->slice_type==FF_B_TYPE ) )
4152         pred_weight_table(h);
4153     else if(h->pps.weighted_bipred_idc==2 && h->slice_type==FF_B_TYPE)
4154         implicit_weight_table(h);
4155     else
4156         h->use_weight = 0;
4157
4158     if(h->nal_ref_idc)
4159         decode_ref_pic_marking(h0, &s->gb);
4160
4161     if(FRAME_MBAFF)
4162         fill_mbaff_ref_list(h);
4163
4164      h->ref2frm[1][0]= h->ref2frm[1][1]= -1;
4165      for(j=0; j<2; j++){
4166          h->ref2frm[j][  0]=
4167          h->ref2frm[j][  1]= -1;
4168          for(i=0; i<48; i++)
4169              h->ref2frm[j][i+2]= 4*h->ref_list[j][i].frame_num
4170                                  +(h->ref_list[j][i].reference&3);
4171      }
4172
4173     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
4174         tmp = get_ue_golomb(&s->gb);
4175         if(tmp > 2){
4176             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
4177             return -1;
4178         }
4179         h->cabac_init_idc= tmp;
4180     }
4181
4182     h->last_qscale_diff = 0;
4183     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
4184     if(tmp>51){
4185         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
4186         return -1;
4187     }
4188     s->qscale= tmp;
4189     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
4190     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
4191     //FIXME qscale / qp ... stuff
4192     if(h->slice_type == FF_SP_TYPE){
4193         get_bits1(&s->gb); /* sp_for_switch_flag */
4194     }
4195     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
4196         get_se_golomb(&s->gb); /* slice_qs_delta */
4197     }
4198
4199     h->deblocking_filter = 1;
4200     h->slice_alpha_c0_offset = 0;
4201     h->slice_beta_offset = 0;
4202     if( h->pps.deblocking_filter_parameters_present ) {
4203         tmp= get_ue_golomb(&s->gb);
4204         if(tmp > 2){
4205             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4206             return -1;
4207         }
4208         h->deblocking_filter= tmp;
4209         if(h->deblocking_filter < 2)
4210             h->deblocking_filter^= 1; // 1<->0
4211
4212         if( h->deblocking_filter ) {
4213             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4214             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4215         }
4216     }
4217
4218     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4219        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type != FF_I_TYPE)
4220        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type == FF_B_TYPE)
4221        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4222         h->deblocking_filter= 0;
4223
4224     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4225         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4226             /* Cheat slightly for speed:
4227                Do not bother to deblock across slices. */
4228             h->deblocking_filter = 2;
4229         } else {
4230             h0->max_contexts = 1;
4231             if(!h0->single_decode_warning) {
4232                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4233                 h0->single_decode_warning = 1;
4234             }
4235             if(h != h0)
4236                 return 1; // deblocking switched inside frame
4237         }
4238     }
4239
4240 #if 0 //FMO
4241     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4242         slice_group_change_cycle= get_bits(&s->gb, ?);
4243 #endif
4244
4245     h0->last_slice_type = slice_type;
4246     h->slice_num = ++h0->current_slice;
4247
4248     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4249     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4250
4251     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4252         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4253                h->slice_num,
4254                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4255                first_mb_in_slice,
4256                av_get_pict_type_char(h->slice_type),
4257                pps_id, h->frame_num,
4258                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4259                h->ref_count[0], h->ref_count[1],
4260                s->qscale,
4261                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4262                h->use_weight,
4263                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4264                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4265                );
4266     }
4267
4268     return 0;
4269 }
4270
4271 /**
4272  *
4273  */
4274 static inline int get_level_prefix(GetBitContext *gb){
4275     unsigned int buf;
4276     int log;
4277
4278     OPEN_READER(re, gb);
4279     UPDATE_CACHE(re, gb);
4280     buf=GET_CACHE(re, gb);
4281
4282     log= 32 - av_log2(buf);
4283 #ifdef TRACE
4284     print_bin(buf>>(32-log), log);
4285     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4286 #endif
4287
4288     LAST_SKIP_BITS(re, gb, log);
4289     CLOSE_READER(re, gb);
4290
4291     return log-1;
4292 }
4293
4294 static inline int get_dct8x8_allowed(H264Context *h){
4295     int i;
4296     for(i=0; i<4; i++){
4297         if(!IS_SUB_8X8(h->sub_mb_type[i])
4298            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4299             return 0;
4300     }
4301     return 1;
4302 }
4303
4304 /**
4305  * decodes a residual block.
4306  * @param n block index
4307  * @param scantable scantable
4308  * @param max_coeff number of coefficients in the block
4309  * @return <0 if an error occurred
4310  */
4311 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4312     MpegEncContext * const s = &h->s;
4313     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4314     int level[16];
4315     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4316
4317     //FIXME put trailing_onex into the context
4318
4319     if(n == CHROMA_DC_BLOCK_INDEX){
4320         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4321         total_coeff= coeff_token>>2;
4322     }else{
4323         if(n == LUMA_DC_BLOCK_INDEX){
4324             total_coeff= pred_non_zero_count(h, 0);
4325             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4326             total_coeff= coeff_token>>2;
4327         }else{
4328             total_coeff= pred_non_zero_count(h, n);
4329             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4330             total_coeff= coeff_token>>2;
4331             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4332         }
4333     }
4334
4335     //FIXME set last_non_zero?
4336
4337     if(total_coeff==0)
4338         return 0;
4339     if(total_coeff > (unsigned)max_coeff) {
4340         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4341         return -1;
4342     }
4343
4344     trailing_ones= coeff_token&3;
4345     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4346     assert(total_coeff<=16);
4347
4348     for(i=0; i<trailing_ones; i++){
4349         level[i]= 1 - 2*get_bits1(gb);
4350     }
4351
4352     if(i<total_coeff) {
4353         int level_code, mask;
4354         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4355         int prefix= get_level_prefix(gb);
4356
4357         //first coefficient has suffix_length equal to 0 or 1
4358         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4359             if(suffix_length)
4360                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4361             else
4362                 level_code= (prefix<<suffix_length); //part
4363         }else if(prefix==14){
4364             if(suffix_length)
4365                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4366             else
4367                 level_code= prefix + get_bits(gb, 4); //part
4368         }else{
4369             level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
4370             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4371             if(prefix>=16)
4372                 level_code += (1<<(prefix-3))-4096;
4373         }
4374
4375         if(trailing_ones < 3) level_code += 2;
4376
4377         suffix_length = 1;
4378         if(level_code > 5)
4379             suffix_length++;
4380         mask= -(level_code&1);
4381         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4382         i++;
4383
4384         //remaining coefficients have suffix_length > 0
4385         for(;i<total_coeff;i++) {
4386             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4387             prefix = get_level_prefix(gb);
4388             if(prefix<15){
4389                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4390             }else{
4391                 level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4392                 if(prefix>=16)
4393                     level_code += (1<<(prefix-3))-4096;
4394             }
4395             mask= -(level_code&1);
4396             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4397             if(level_code > suffix_limit[suffix_length])
4398                 suffix_length++;
4399         }
4400     }
4401
4402     if(total_coeff == max_coeff)
4403         zeros_left=0;
4404     else{
4405         if(n == CHROMA_DC_BLOCK_INDEX)
4406             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4407         else
4408             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4409     }
4410
4411     coeff_num = zeros_left + total_coeff - 1;
4412     j = scantable[coeff_num];
4413     if(n > 24){
4414         block[j] = level[0];
4415         for(i=1;i<total_coeff;i++) {
4416             if(zeros_left <= 0)
4417                 run_before = 0;
4418             else if(zeros_left < 7){
4419                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4420             }else{
4421                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4422             }
4423             zeros_left -= run_before;
4424             coeff_num -= 1 + run_before;
4425             j= scantable[ coeff_num ];
4426
4427             block[j]= level[i];
4428         }
4429     }else{
4430         block[j] = (level[0] * qmul[j] + 32)>>6;
4431         for(i=1;i<total_coeff;i++) {
4432             if(zeros_left <= 0)
4433                 run_before = 0;
4434             else if(zeros_left < 7){
4435                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4436             }else{
4437                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4438             }
4439             zeros_left -= run_before;
4440             coeff_num -= 1 + run_before;
4441             j= scantable[ coeff_num ];
4442
4443             block[j]= (level[i] * qmul[j] + 32)>>6;
4444         }
4445     }
4446
4447     if(zeros_left<0){
4448         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4449         return -1;
4450     }
4451
4452     return 0;
4453 }
4454
4455 static void predict_field_decoding_flag(H264Context *h){
4456     MpegEncContext * const s = &h->s;
4457     const int mb_xy= h->mb_xy;
4458     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4459                 ? s->current_picture.mb_type[mb_xy-1]
4460                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4461                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4462                 : 0;
4463     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4464 }
4465
4466 /**
4467  * decodes a P_SKIP or B_SKIP macroblock
4468  */
4469 static void decode_mb_skip(H264Context *h){
4470     MpegEncContext * const s = &h->s;
4471     const int mb_xy= h->mb_xy;
4472     int mb_type=0;
4473
4474     memset(h->non_zero_count[mb_xy], 0, 16);
4475     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4476
4477     if(MB_FIELD)
4478         mb_type|= MB_TYPE_INTERLACED;
4479
4480     if( h->slice_type == FF_B_TYPE )
4481     {
4482         // just for fill_caches. pred_direct_motion will set the real mb_type
4483         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4484
4485         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4486         pred_direct_motion(h, &mb_type);
4487         mb_type|= MB_TYPE_SKIP;
4488     }
4489     else
4490     {
4491         int mx, my;
4492         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4493
4494         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4495         pred_pskip_motion(h, &mx, &my);
4496         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4497         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4498     }
4499
4500     write_back_motion(h, mb_type);
4501     s->current_picture.mb_type[mb_xy]= mb_type;
4502     s->current_picture.qscale_table[mb_xy]= s->qscale;
4503     h->slice_table[ mb_xy ]= h->slice_num;
4504     h->prev_mb_skipped= 1;
4505 }
4506
4507 /**
4508  * decodes a macroblock
4509  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4510  */
4511 static int decode_mb_cavlc(H264Context *h){
4512     MpegEncContext * const s = &h->s;
4513     int mb_xy;
4514     int partition_count;
4515     unsigned int mb_type, cbp;
4516     int dct8x8_allowed= h->pps.transform_8x8_mode;
4517
4518     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4519
4520     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
4521
4522     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4523     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4524                 down the code */
4525     if(h->slice_type_nos != FF_I_TYPE){
4526         if(s->mb_skip_run==-1)
4527             s->mb_skip_run= get_ue_golomb(&s->gb);
4528
4529         if (s->mb_skip_run--) {
4530             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4531                 if(s->mb_skip_run==0)
4532                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4533                 else
4534                     predict_field_decoding_flag(h);
4535             }
4536             decode_mb_skip(h);
4537             return 0;
4538         }
4539     }
4540     if(FRAME_MBAFF){
4541         if( (s->mb_y&1) == 0 )
4542             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4543     }else
4544         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
4545
4546     h->prev_mb_skipped= 0;
4547
4548     mb_type= get_ue_golomb(&s->gb);
4549     if(h->slice_type == FF_B_TYPE){
4550         if(mb_type < 23){
4551             partition_count= b_mb_type_info[mb_type].partition_count;
4552             mb_type=         b_mb_type_info[mb_type].type;
4553         }else{
4554             mb_type -= 23;
4555             goto decode_intra_mb;
4556         }
4557     }else if(h->slice_type == FF_P_TYPE /*|| h->slice_type == FF_SP_TYPE */){
4558         if(mb_type < 5){
4559             partition_count= p_mb_type_info[mb_type].partition_count;
4560             mb_type=         p_mb_type_info[mb_type].type;
4561         }else{
4562             mb_type -= 5;
4563             goto decode_intra_mb;
4564         }
4565     }else{
4566        assert(h->slice_type == FF_I_TYPE);
4567 decode_intra_mb:
4568         if(mb_type > 25){
4569             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4570             return -1;
4571         }
4572         partition_count=0;
4573         cbp= i_mb_type_info[mb_type].cbp;
4574         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4575         mb_type= i_mb_type_info[mb_type].type;
4576     }
4577
4578     if(MB_FIELD)
4579         mb_type |= MB_TYPE_INTERLACED;
4580
4581     h->slice_table[ mb_xy ]= h->slice_num;
4582
4583     if(IS_INTRA_PCM(mb_type)){
4584         unsigned int x, y;
4585
4586         // We assume these blocks are very rare so we do not optimize it.
4587         align_get_bits(&s->gb);
4588
4589         // The pixels are stored in the same order as levels in h->mb array.
4590         for(y=0; y<16; y++){
4591             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
4592             for(x=0; x<16; x++){
4593                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4594                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
4595             }
4596         }
4597         for(y=0; y<8; y++){
4598             const int index= 256 + 4*(y&3) + 32*(y>>2);
4599             for(x=0; x<8; x++){
4600                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4601                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4602             }
4603         }
4604         for(y=0; y<8; y++){
4605             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
4606             for(x=0; x<8; x++){
4607                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4608                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4609             }
4610         }
4611
4612         // In deblocking, the quantizer is 0
4613         s->current_picture.qscale_table[mb_xy]= 0;
4614         h->chroma_qp[0] = get_chroma_qp(h, 0, 0);
4615         h->chroma_qp[1] = get_chroma_qp(h, 1, 0);
4616         // All coeffs are present
4617         memset(h->non_zero_count[mb_xy], 16, 16);
4618
4619         s->current_picture.mb_type[mb_xy]= mb_type;
4620         return 0;
4621     }
4622
4623     if(MB_MBAFF){
4624         h->ref_count[0] <<= 1;
4625         h->ref_count[1] <<= 1;
4626     }
4627
4628     fill_caches(h, mb_type, 0);
4629
4630     //mb_pred
4631     if(IS_INTRA(mb_type)){
4632             int pred_mode;
4633 //            init_top_left_availability(h);
4634             if(IS_INTRA4x4(mb_type)){
4635                 int i;
4636                 int di = 1;
4637                 if(dct8x8_allowed && get_bits1(&s->gb)){
4638                     mb_type |= MB_TYPE_8x8DCT;
4639                     di = 4;
4640                 }
4641
4642 //                fill_intra4x4_pred_table(h);
4643                 for(i=0; i<16; i+=di){
4644                     int mode= pred_intra_mode(h, i);
4645
4646                     if(!get_bits1(&s->gb)){
4647                         const int rem_mode= get_bits(&s->gb, 3);
4648                         mode = rem_mode + (rem_mode >= mode);
4649                     }
4650
4651                     if(di==4)
4652                         fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4653                     else
4654                         h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4655                 }
4656                 write_back_intra_pred_mode(h);
4657                 if( check_intra4x4_pred_mode(h) < 0)
4658                     return -1;
4659             }else{
4660                 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4661                 if(h->intra16x16_pred_mode < 0)
4662                     return -1;
4663             }
4664
4665             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4666             if(pred_mode < 0)
4667                 return -1;
4668             h->chroma_pred_mode= pred_mode;
4669     }else if(partition_count==4){
4670         int i, j, sub_partition_count[4], list, ref[2][4];
4671
4672         if(h->slice_type == FF_B_TYPE){
4673             for(i=0; i<4; i++){
4674                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4675                 if(h->sub_mb_type[i] >=13){
4676                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4677                     return -1;
4678                 }
4679                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4680                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4681             }
4682             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4683                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4684                 pred_direct_motion(h, &mb_type);
4685                 h->ref_cache[0][scan8[4]] =
4686                 h->ref_cache[1][scan8[4]] =
4687                 h->ref_cache[0][scan8[12]] =
4688                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4689             }
4690         }else{
4691             assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4692             for(i=0; i<4; i++){
4693                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4694                 if(h->sub_mb_type[i] >=4){
4695                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4696                     return -1;
4697                 }
4698                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4699                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4700             }
4701         }
4702
4703         for(list=0; list<h->list_count; list++){
4704             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4705             for(i=0; i<4; i++){
4706                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4707                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4708                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4709                     if(tmp>=ref_count){
4710                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4711                         return -1;
4712                     }
4713                     ref[list][i]= tmp;
4714                 }else{
4715                  //FIXME
4716                     ref[list][i] = -1;
4717                 }
4718             }
4719         }
4720
4721         if(dct8x8_allowed)
4722             dct8x8_allowed = get_dct8x8_allowed(h);
4723
4724         for(list=0; list<h->list_count; list++){
4725             for(i=0; i<4; i++){
4726                 if(IS_DIRECT(h->sub_mb_type[i])) {
4727                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4728                     continue;
4729                 }
4730                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4731                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4732
4733                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4734                     const int sub_mb_type= h->sub_mb_type[i];
4735                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4736                     for(j=0; j<sub_partition_count[i]; j++){
4737                         int mx, my;
4738                         const int index= 4*i + block_width*j;
4739                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4740                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4741                         mx += get_se_golomb(&s->gb);
4742                         my += get_se_golomb(&s->gb);
4743                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4744
4745                         if(IS_SUB_8X8(sub_mb_type)){
4746                             mv_cache[ 1 ][0]=
4747                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4748                             mv_cache[ 1 ][1]=
4749                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4750                         }else if(IS_SUB_8X4(sub_mb_type)){
4751                             mv_cache[ 1 ][0]= mx;
4752                             mv_cache[ 1 ][1]= my;
4753                         }else if(IS_SUB_4X8(sub_mb_type)){
4754                             mv_cache[ 8 ][0]= mx;
4755                             mv_cache[ 8 ][1]= my;
4756                         }
4757                         mv_cache[ 0 ][0]= mx;
4758                         mv_cache[ 0 ][1]= my;
4759                     }
4760                 }else{
4761                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4762                     p[0] = p[1]=
4763                     p[8] = p[9]= 0;
4764                 }
4765             }
4766         }
4767     }else if(IS_DIRECT(mb_type)){
4768         pred_direct_motion(h, &mb_type);
4769         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4770     }else{
4771         int list, mx, my, i;
4772          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4773         if(IS_16X16(mb_type)){
4774             for(list=0; list<h->list_count; list++){
4775                     unsigned int val;
4776                     if(IS_DIR(mb_type, 0, list)){
4777                         val= get_te0_golomb(&s->gb, h->ref_count[list]);
4778                         if(val >= h->ref_count[list]){
4779                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4780                             return -1;
4781                         }
4782                     }else
4783                         val= LIST_NOT_USED&0xFF;
4784                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4785             }
4786             for(list=0; list<h->list_count; list++){
4787                 unsigned int val;
4788                 if(IS_DIR(mb_type, 0, list)){
4789                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4790                     mx += get_se_golomb(&s->gb);
4791                     my += get_se_golomb(&s->gb);
4792                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4793
4794                     val= pack16to32(mx,my);
4795                 }else
4796                     val=0;
4797                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4798             }
4799         }
4800         else if(IS_16X8(mb_type)){
4801             for(list=0; list<h->list_count; list++){
4802                     for(i=0; i<2; i++){
4803                         unsigned int val;
4804                         if(IS_DIR(mb_type, i, list)){
4805                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4806                             if(val >= h->ref_count[list]){
4807                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4808                                 return -1;
4809                             }
4810                         }else
4811                             val= LIST_NOT_USED&0xFF;
4812                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4813                     }
4814             }
4815             for(list=0; list<h->list_count; list++){
4816                 for(i=0; i<2; i++){
4817                     unsigned int val;
4818                     if(IS_DIR(mb_type, i, list)){
4819                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4820                         mx += get_se_golomb(&s->gb);
4821                         my += get_se_golomb(&s->gb);
4822                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4823
4824                         val= pack16to32(mx,my);
4825                     }else
4826                         val=0;
4827                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4828                 }
4829             }
4830         }else{
4831             assert(IS_8X16(mb_type));
4832             for(list=0; list<h->list_count; list++){
4833                     for(i=0; i<2; i++){
4834                         unsigned int val;
4835                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4836                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4837                             if(val >= h->ref_count[list]){
4838                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4839                                 return -1;
4840                             }
4841                         }else
4842                             val= LIST_NOT_USED&0xFF;
4843                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4844                     }
4845             }
4846             for(list=0; list<h->list_count; list++){
4847                 for(i=0; i<2; i++){
4848                     unsigned int val;
4849                     if(IS_DIR(mb_type, i, list)){
4850                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4851                         mx += get_se_golomb(&s->gb);
4852                         my += get_se_golomb(&s->gb);
4853                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4854
4855                         val= pack16to32(mx,my);
4856                     }else
4857                         val=0;
4858                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4859                 }
4860             }
4861         }
4862     }
4863
4864     if(IS_INTER(mb_type))
4865         write_back_motion(h, mb_type);
4866
4867     if(!IS_INTRA16x16(mb_type)){
4868         cbp= get_ue_golomb(&s->gb);
4869         if(cbp > 47){
4870             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4871             return -1;
4872         }
4873
4874         if(IS_INTRA4x4(mb_type))
4875             cbp= golomb_to_intra4x4_cbp[cbp];
4876         else
4877             cbp= golomb_to_inter_cbp[cbp];
4878     }
4879     h->cbp = cbp;
4880
4881     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4882         if(get_bits1(&s->gb))
4883             mb_type |= MB_TYPE_8x8DCT;
4884     }
4885     s->current_picture.mb_type[mb_xy]= mb_type;
4886
4887     if(cbp || IS_INTRA16x16(mb_type)){
4888         int i8x8, i4x4, chroma_idx;
4889         int dquant;
4890         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4891         const uint8_t *scan, *scan8x8, *dc_scan;
4892
4893 //        fill_non_zero_count_cache(h);
4894
4895         if(IS_INTERLACED(mb_type)){
4896             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4897             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4898             dc_scan= luma_dc_field_scan;
4899         }else{
4900             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4901             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4902             dc_scan= luma_dc_zigzag_scan;
4903         }
4904
4905         dquant= get_se_golomb(&s->gb);
4906
4907         if( dquant > 25 || dquant < -26 ){
4908             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4909             return -1;
4910         }
4911
4912         s->qscale += dquant;
4913         if(((unsigned)s->qscale) > 51){
4914             if(s->qscale<0) s->qscale+= 52;
4915             else            s->qscale-= 52;
4916         }
4917
4918         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4919         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4920         if(IS_INTRA16x16(mb_type)){
4921             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4922                 return -1; //FIXME continue if partitioned and other return -1 too
4923             }
4924
4925             assert((cbp&15) == 0 || (cbp&15) == 15);
4926
4927             if(cbp&15){
4928                 for(i8x8=0; i8x8<4; i8x8++){
4929                     for(i4x4=0; i4x4<4; i4x4++){
4930                         const int index= i4x4 + 4*i8x8;
4931                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4932                             return -1;
4933                         }
4934                     }
4935                 }
4936             }else{
4937                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4938             }
4939         }else{
4940             for(i8x8=0; i8x8<4; i8x8++){
4941                 if(cbp & (1<<i8x8)){
4942                     if(IS_8x8DCT(mb_type)){
4943                         DCTELEM *buf = &h->mb[64*i8x8];
4944                         uint8_t *nnz;
4945                         for(i4x4=0; i4x4<4; i4x4++){
4946                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4947                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4948                                 return -1;
4949                         }
4950                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4951                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4952                     }else{
4953                         for(i4x4=0; i4x4<4; i4x4++){
4954                             const int index= i4x4 + 4*i8x8;
4955
4956                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4957                                 return -1;
4958                             }
4959                         }
4960                     }
4961                 }else{
4962                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4963                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4964                 }
4965             }
4966         }
4967
4968         if(cbp&0x30){
4969             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4970                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4971                     return -1;
4972                 }
4973         }
4974
4975         if(cbp&0x20){
4976             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4977                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4978                 for(i4x4=0; i4x4<4; i4x4++){
4979                     const int index= 16 + 4*chroma_idx + i4x4;
4980                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4981                         return -1;
4982                     }
4983                 }
4984             }
4985         }else{
4986             uint8_t * const nnz= &h->non_zero_count_cache[0];
4987             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4988             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4989         }
4990     }else{
4991         uint8_t * const nnz= &h->non_zero_count_cache[0];
4992         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4993         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4994         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4995     }
4996     s->current_picture.qscale_table[mb_xy]= s->qscale;
4997     write_back_non_zero_count(h);
4998
4999     if(MB_MBAFF){
5000         h->ref_count[0] >>= 1;
5001         h->ref_count[1] >>= 1;
5002     }
5003
5004     return 0;
5005 }
5006
5007 static int decode_cabac_field_decoding_flag(H264Context *h) {
5008     MpegEncContext * const s = &h->s;
5009     const int mb_x = s->mb_x;
5010     const int mb_y = s->mb_y & ~1;
5011     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
5012     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
5013
5014     unsigned int ctx = 0;
5015
5016     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
5017         ctx += 1;
5018     }
5019     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
5020         ctx += 1;
5021     }
5022
5023     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
5024 }
5025
5026 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
5027     uint8_t *state= &h->cabac_state[ctx_base];
5028     int mb_type;
5029
5030     if(intra_slice){
5031         MpegEncContext * const s = &h->s;
5032         const int mba_xy = h->left_mb_xy[0];
5033         const int mbb_xy = h->top_mb_xy;
5034         int ctx=0;
5035         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
5036             ctx++;
5037         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
5038             ctx++;
5039         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
5040             return 0;   /* I4x4 */
5041         state += 2;
5042     }else{
5043         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
5044             return 0;   /* I4x4 */
5045     }
5046
5047     if( get_cabac_terminate( &h->cabac ) )
5048         return 25;  /* PCM */
5049
5050     mb_type = 1; /* I16x16 */
5051     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
5052     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
5053         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
5054     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
5055     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
5056     return mb_type;
5057 }
5058
5059 static int decode_cabac_mb_type( H264Context *h ) {
5060     MpegEncContext * const s = &h->s;
5061
5062     if( h->slice_type == FF_I_TYPE ) {
5063         return decode_cabac_intra_mb_type(h, 3, 1);
5064     } else if( h->slice_type == FF_P_TYPE ) {
5065         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5066             /* P-type */
5067             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5068                 /* P_L0_D16x16, P_8x8 */
5069                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5070             } else {
5071                 /* P_L0_D8x16, P_L0_D16x8 */
5072                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5073             }
5074         } else {
5075             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
5076         }
5077     } else if( h->slice_type == FF_B_TYPE ) {
5078         const int mba_xy = h->left_mb_xy[0];
5079         const int mbb_xy = h->top_mb_xy;
5080         int ctx = 0;
5081         int bits;
5082
5083         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
5084             ctx++;
5085         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
5086             ctx++;
5087
5088         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
5089             return 0; /* B_Direct_16x16 */
5090
5091         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
5092             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
5093         }
5094
5095         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
5096         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
5097         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
5098         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5099         if( bits < 8 )
5100             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
5101         else if( bits == 13 ) {
5102             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
5103         } else if( bits == 14 )
5104             return 11; /* B_L1_L0_8x16 */
5105         else if( bits == 15 )
5106             return 22; /* B_8x8 */
5107
5108         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5109         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5110     } else {
5111         /* TODO SI/SP frames? */
5112         return -1;
5113     }
5114 }
5115
5116 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
5117     MpegEncContext * const s = &h->s;
5118     int mba_xy, mbb_xy;
5119     int ctx = 0;
5120
5121     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
5122         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
5123         mba_xy = mb_xy - 1;
5124         if( (mb_y&1)
5125             && h->slice_table[mba_xy] == h->slice_num
5126             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
5127             mba_xy += s->mb_stride;
5128         if( MB_FIELD ){
5129             mbb_xy = mb_xy - s->mb_stride;
5130             if( !(mb_y&1)
5131                 && h->slice_table[mbb_xy] == h->slice_num
5132                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
5133                 mbb_xy -= s->mb_stride;
5134         }else
5135             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
5136     }else{
5137         int mb_xy = h->mb_xy;
5138         mba_xy = mb_xy - 1;
5139         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
5140     }
5141
5142     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5143         ctx++;
5144     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5145         ctx++;
5146
5147     if( h->slice_type == FF_B_TYPE )
5148         ctx += 13;
5149     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
5150 }
5151
5152 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5153     int mode = 0;
5154
5155     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5156         return pred_mode;
5157
5158     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5159     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5160     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5161
5162     if( mode >= pred_mode )
5163         return mode + 1;
5164     else
5165         return mode;
5166 }
5167
5168 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5169     const int mba_xy = h->left_mb_xy[0];
5170     const int mbb_xy = h->top_mb_xy;
5171
5172     int ctx = 0;
5173
5174     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5175     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5176         ctx++;
5177
5178     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5179         ctx++;
5180
5181     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5182         return 0;
5183
5184     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5185         return 1;
5186     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5187         return 2;
5188     else
5189         return 3;
5190 }
5191
5192 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5193     int cbp_b, cbp_a, ctx, cbp = 0;
5194
5195     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5196     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
5197
5198     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5199     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5200     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
5201     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5202     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
5203     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5204     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
5205     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5206     return cbp;
5207 }
5208 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5209     int ctx;
5210     int cbp_a, cbp_b;
5211
5212     cbp_a = (h->left_cbp>>4)&0x03;
5213     cbp_b = (h-> top_cbp>>4)&0x03;
5214
5215     ctx = 0;
5216     if( cbp_a > 0 ) ctx++;
5217     if( cbp_b > 0 ) ctx += 2;
5218     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5219         return 0;
5220
5221     ctx = 4;
5222     if( cbp_a == 2 ) ctx++;
5223     if( cbp_b == 2 ) ctx += 2;
5224     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5225 }
5226 static int decode_cabac_mb_dqp( H264Context *h) {
5227     int   ctx = 0;
5228     int   val = 0;
5229
5230     if( h->last_qscale_diff != 0 )
5231         ctx++;
5232
5233     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5234         if( ctx < 2 )
5235             ctx = 2;
5236         else
5237             ctx = 3;
5238         val++;
5239         if(val > 102) //prevent infinite loop
5240             return INT_MIN;
5241     }
5242
5243     if( val&0x01 )
5244         return (val + 1)/2;
5245     else
5246         return -(val + 1)/2;
5247 }
5248 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5249     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5250         return 0;   /* 8x8 */
5251     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5252         return 1;   /* 8x4 */
5253     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5254         return 2;   /* 4x8 */
5255     return 3;       /* 4x4 */
5256 }
5257 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5258     int type;
5259     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5260         return 0;   /* B_Direct_8x8 */
5261     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5262         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5263     type = 3;
5264     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5265         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5266             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5267         type += 4;
5268     }
5269     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5270     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5271     return type;
5272 }
5273
5274 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5275     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5276 }
5277
5278 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5279     int refa = h->ref_cache[list][scan8[n] - 1];
5280     int refb = h->ref_cache[list][scan8[n] - 8];
5281     int ref  = 0;
5282     int ctx  = 0;
5283
5284     if( h->slice_type == FF_B_TYPE) {
5285         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5286             ctx++;
5287         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5288             ctx += 2;
5289     } else {
5290         if( refa > 0 )
5291             ctx++;
5292         if( refb > 0 )
5293             ctx += 2;
5294     }
5295
5296     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5297         ref++;
5298         if( ctx < 4 )
5299             ctx = 4;
5300         else
5301             ctx = 5;
5302         if(ref >= 32 /*h->ref_list[list]*/){
5303             av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
5304             return 0; //FIXME we should return -1 and check the return everywhere
5305         }
5306     }
5307     return ref;
5308 }
5309
5310 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5311     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5312                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5313     int ctxbase = (l == 0) ? 40 : 47;
5314     int ctx, mvd;
5315
5316     if( amvd < 3 )
5317         ctx = 0;
5318     else if( amvd > 32 )
5319         ctx = 2;
5320     else
5321         ctx = 1;
5322
5323     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5324         return 0;
5325
5326     mvd= 1;
5327     ctx= 3;
5328     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5329         mvd++;
5330         if( ctx < 6 )
5331             ctx++;
5332     }
5333
5334     if( mvd >= 9 ) {
5335         int k = 3;
5336         while( get_cabac_bypass( &h->cabac ) ) {
5337             mvd += 1 << k;
5338             k++;
5339             if(k>24){
5340                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5341                 return INT_MIN;
5342             }
5343         }
5344         while( k-- ) {
5345             if( get_cabac_bypass( &h->cabac ) )
5346                 mvd += 1 << k;
5347         }
5348     }
5349     return get_cabac_bypass_sign( &h->cabac, -mvd );
5350 }
5351
5352 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5353     int nza, nzb;
5354     int ctx = 0;
5355
5356     if( is_dc ) {
5357         if( cat == 0 ) {
5358             nza = h->left_cbp&0x100;
5359             nzb = h-> top_cbp&0x100;
5360         } else {
5361             nza = (h->left_cbp>>(6+idx))&0x01;
5362             nzb = (h-> top_cbp>>(6+idx))&0x01;
5363         }
5364     } else {
5365         if( cat == 4 ) {
5366             nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5367             nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5368         } else {
5369             assert(cat == 1 || cat == 2);
5370             nza = h->non_zero_count_cache[scan8[idx] - 1];
5371             nzb = h->non_zero_count_cache[scan8[idx] - 8];
5372         }
5373     }
5374
5375     if( nza > 0 )
5376         ctx++;
5377
5378     if( nzb > 0 )
5379         ctx += 2;
5380
5381     return ctx + 4 * cat;
5382 }
5383
/**
 * Per-scan-position context offsets for the "last coefficient" flag of 8x8
 * blocks. In the C significance loop of decode_cabac_residual_internal(),
 * entry i (0..62) is added to last_coeff_ctx_base when scan position i is
 * found significant.
 * NOTE(review): declared through DECLARE_ASM_CONST, presumably so that the
 * x86 assembly significance decoder can reference it -- confirm.
 */
5384 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5385     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5386     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5387     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5388     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5389 };
5390
/**
 * Decodes the residual coefficients of one block with CABAC.
 *
 * Steps: read the coded block flag (skipped for non-DC cat 5), decode which
 * scan positions are significant, record the coefficient count (in
 * h->cbp_table for DC blocks, in h->non_zero_count_cache otherwise), then
 * decode the levels and signs and store them into block[].
 *
 * @param h         decoder context
 * @param block     destination coefficients, written at scantable[] positions
 * @param cat       block category, see the table inside the function
 * @param n         block index; its meaning depends on cat
 * @param scantable maps a scan position to an index into block[]
 * @param qmul      dequantisation factors indexed like block[]; not read
 *                  when is_dc is set
 * @param max_coeff number of coefficients in the block
 * @param is_dc     non-zero for DC blocks, which are stored without
 *                  dequantisation
 */
5391 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5392     static const int significant_coeff_flag_offset[2][6] = {
5393       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5394       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5395     };
5396     static const int last_coeff_flag_offset[2][6] = {
5397       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5398       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5399     };
5400     static const int coeff_abs_level_m1_offset[6] = {
5401         227+0, 227+10, 227+20, 227+30, 227+39, 426
5402     };
5403     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5404       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5405         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5406         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5407        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5408       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5409         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5410         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5411         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5412     };
5413     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5414      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5415      * map node ctx => cabac ctx for level=1 */
5416     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5417     /* map node ctx => cabac ctx for level>1 */
5418     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5419     static const uint8_t coeff_abs_level_transition[2][8] = {
5420     /* update node ctx after decoding a level=1 */
5421         { 1, 2, 3, 3, 4, 5, 6, 7 },
5422     /* update node ctx after decoding a level>1 */
5423         { 4, 4, 4, 4, 5, 6, 7, 7 }
5424     };
5425
         /* scan positions of the significant coefficients, in the order found */
5426     int index[64];
5427
5428     int av_unused last;
5429     int coeff_count = 0;
5430     int node_ctx = 0;
5431
5432     uint8_t *significant_coeff_ctx_base;
5433     uint8_t *last_coeff_ctx_base;
5434     uint8_t *abs_level_m1_ctx_base;
5435
         /* With CABAC_ON_STACK (forced here on non-x86 builds) decoding runs on a
          * local copy of range/low/bytestream; the copy is written back to
          * h->cabac before each return. */
5436 #ifndef ARCH_X86
5437 #define CABAC_ON_STACK
5438 #endif
5439 #ifdef CABAC_ON_STACK
5440 #define CC &cc
5441     CABACContext cc;
5442     cc.range     = h->cabac.range;
5443     cc.low       = h->cabac.low;
5444     cc.bytestream= h->cabac.bytestream;
5445 #else
5446 #define CC &h->cabac
5447 #endif
5448
5449
5450     /* cat: 0-> DC 16x16  n = 0
5451      *      1-> AC 16x16  n = luma4x4idx
5452      *      2-> Luma4x4   n = luma4x4idx
5453      *      3-> DC Chroma n = iCbCr
5454      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5455      *      5-> Luma8x8   n = 4 * luma8x8idx
5456      */
5457
5458     /* read coded block flag */
5459     if( is_dc || cat != 5 ) {
5460         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
                 /* block not coded: record a zero count for non-DC blocks and stop */
5461             if( !is_dc ) {
5462                 if( cat == 4 )
5463                     h->non_zero_count_cache[scan8[16+n]] = 0;
5464                 else
5465                     h->non_zero_count_cache[scan8[n]] = 0;
5466             }
5467
5468 #ifdef CABAC_ON_STACK
5469             h->cabac.range     = cc.range     ;
5470             h->cabac.low       = cc.low       ;
5471             h->cabac.bytestream= cc.bytestream;
5472 #endif
5473             return;
5474         }
5475     }
5476
         /* context bases are selected by MB_FIELD and the block category */
5477     significant_coeff_ctx_base = h->cabac_state
5478         + significant_coeff_flag_offset[MB_FIELD][cat];
5479     last_coeff_ctx_base = h->cabac_state
5480         + last_coeff_flag_offset[MB_FIELD][cat];
5481     abs_level_m1_ctx_base = h->cabac_state
5482         + coeff_abs_level_m1_offset[cat];
5483
         /* Significance map. The C macro below is used unless the x86 assembly
          * variants are compiled in; note that the braces of this if/else are
          * split across the preprocessor branches. */
5484     if( !is_dc && cat == 5 ) {
5485 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5486         for(last= 0; last < coefs; last++) { \
5487             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5488             if( get_cabac( CC, sig_ctx )) { \
5489                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5490                 index[coeff_count++] = last; \
5491                 if( get_cabac( CC, last_ctx ) ) { \
5492                     last= max_coeff; \
5493                     break; \
5494                 } \
5495             } \
5496         }\
5497         if( last == max_coeff -1 ) {\
5498             index[coeff_count++] = last;\
5499         }
5500         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5501 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5502         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5503     } else {
5504         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5505 #else
5506         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5507     } else {
5508         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5509 #endif
5510     }
5511     assert(coeff_count > 0);
5512
         /* record that the block has coefficients: a cbp_table bit for DC blocks,
          * the coefficient count in the nnz cache otherwise */
5513     if( is_dc ) {
5514         if( cat == 0 )
5515             h->cbp_table[h->mb_xy] |= 0x100;
5516         else
5517             h->cbp_table[h->mb_xy] |= 0x40 << n;
5518     } else {
5519         if( cat == 5 )
5520             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5521         else if( cat == 4 )
5522             h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5523         else {
5524             assert( cat == 1 || cat == 2 );
5525             h->non_zero_count_cache[scan8[n]] = coeff_count;
5526         }
5527     }
5528
         /* levels are decoded from the last recorded significant position back
          * to the first */
5529     while( coeff_count-- ) {
5530         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5531
5532         int j= scantable[index[coeff_count]];
5533
5534         if( get_cabac( CC, ctx ) == 0 ) {
                 /* magnitude 1: only the sign is read; non-DC values are scaled by
                  * qmul[j] and rounded with (x + 32) >> 6 */
5535             node_ctx = coeff_abs_level_transition[0][node_ctx];
5536             if( is_dc ) {
5537                 block[j] = get_cabac_bypass_sign( CC, -1);
5538             }else{
5539                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5540             }
5541         } else {
5542             int coeff_abs = 2;
5543             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5544             node_ctx = coeff_abs_level_transition[1][node_ctx];
5545
5546             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5547                 coeff_abs++;
5548             }
5549
                 /* escape: bypass-coded unary length followed by that many bits.
                  * The inner j shadows the coefficient position j declared above.
                  * NOTE(review): the unary loop has no upper bound on j; on a
                  * damaged stream coeff_abs could presumably overflow below --
                  * consider limiting it. */
5550             if( coeff_abs >= 15 ) {
5551                 int j = 0;
5552                 while( get_cabac_bypass( CC ) ) {
5553                     j++;
5554                 }
5555
5556                 coeff_abs=1;
5557                 while( j-- ) {
5558                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5559                 }
5560                 coeff_abs+= 14;
5561             }
5562
5563             if( is_dc ) {
5564                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5565             }else{
5566                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5567             }
5568         }
5569     }
5570 #ifdef CABAC_ON_STACK
5571             h->cabac.range     = cc.range     ;
5572             h->cabac.low       = cc.low       ;
5573             h->cabac.bytestream= cc.bytestream;
5574 #endif
5575
5576 }
5577
5578 #ifndef CONFIG_SMALL
5579 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5580     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5581 }
5582
5583 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5584     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5585 }
5586 #endif
5587
5588 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5589 #ifdef CONFIG_SMALL
5590     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5591 #else
5592     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5593     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5594 #endif
5595 }
5596
5597 static inline void compute_mb_neighbors(H264Context *h)
5598 {
5599     MpegEncContext * const s = &h->s;
5600     const int mb_xy  = h->mb_xy;
5601     h->top_mb_xy     = mb_xy - s->mb_stride;
5602     h->left_mb_xy[0] = mb_xy - 1;
5603     if(FRAME_MBAFF){
5604         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5605         const int top_pair_xy      = pair_xy     - s->mb_stride;
5606         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5607         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5608         const int curr_mb_frame_flag = !MB_FIELD;
5609         const int bottom = (s->mb_y & 1);
5610         if (bottom
5611                 ? !curr_mb_frame_flag // bottom macroblock
5612                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5613                 ) {
5614             h->top_mb_xy -= s->mb_stride;
5615         }
5616         if (left_mb_frame_flag != curr_mb_frame_flag) {
5617             h->left_mb_xy[0] = pair_xy - 1;
5618         }
5619     } else if (FIELD_PICTURE) {
5620         h->top_mb_xy -= s->mb_stride;
5621     }
5622     return;
5623 }
5624
5625 /**
5626  * decodes a macroblock
5627  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5628  */
5629 static int decode_mb_cabac(H264Context *h) {
5630     MpegEncContext * const s = &h->s;
5631     int mb_xy;
5632     int mb_type, partition_count, cbp = 0;
5633     int dct8x8_allowed= h->pps.transform_8x8_mode;
5634
5635     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5636
5637     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5638
5639     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5640     if( h->slice_type_nos != FF_I_TYPE ) {
5641         int skip;
5642         /* a skipped mb needs the aff flag from the following mb */
5643         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5644             predict_field_decoding_flag(h);
5645         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5646             skip = h->next_mb_skipped;
5647         else
5648             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5649         /* read skip flags */
5650         if( skip ) {
5651             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5652                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5653                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5654                 if(h->next_mb_skipped)
5655                     predict_field_decoding_flag(h);
5656                 else
5657                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5658             }
5659
5660             decode_mb_skip(h);
5661
5662             h->cbp_table[mb_xy] = 0;
5663             h->chroma_pred_mode_table[mb_xy] = 0;
5664             h->last_qscale_diff = 0;
5665
5666             return 0;
5667
5668         }
5669     }
5670     if(FRAME_MBAFF){
5671         if( (s->mb_y&1) == 0 )
5672             h->mb_mbaff =
5673             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5674     }else
5675         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5676
5677     h->prev_mb_skipped = 0;
5678
5679     compute_mb_neighbors(h);
5680     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5681         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5682         return -1;
5683     }
5684
5685     if( h->slice_type == FF_B_TYPE ) {
5686         if( mb_type < 23 ){
5687             partition_count= b_mb_type_info[mb_type].partition_count;
5688             mb_type=         b_mb_type_info[mb_type].type;
5689         }else{
5690             mb_type -= 23;
5691             goto decode_intra_mb;
5692         }
5693     } else if( h->slice_type == FF_P_TYPE ) {
5694         if( mb_type < 5) {
5695             partition_count= p_mb_type_info[mb_type].partition_count;
5696             mb_type=         p_mb_type_info[mb_type].type;
5697         } else {
5698             mb_type -= 5;
5699             goto decode_intra_mb;
5700         }
5701     } else {
5702        assert(h->slice_type == FF_I_TYPE);
5703 decode_intra_mb:
5704         partition_count = 0;
5705         cbp= i_mb_type_info[mb_type].cbp;
5706         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5707         mb_type= i_mb_type_info[mb_type].type;
5708     }
5709     if(MB_FIELD)
5710         mb_type |= MB_TYPE_INTERLACED;
5711
5712     h->slice_table[ mb_xy ]= h->slice_num;
5713
5714     if(IS_INTRA_PCM(mb_type)) {
5715         const uint8_t *ptr;
5716         unsigned int x, y;
5717
5718         // We assume these blocks are very rare so we do not optimize it.
5719         // FIXME The two following lines get the bitstream position in the cabac
5720         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5721         ptr= h->cabac.bytestream;
5722         if(h->cabac.low&0x1) ptr--;
5723         if(CABAC_BITS==16){
5724             if(h->cabac.low&0x1FF) ptr--;
5725         }
5726
5727         // The pixels are stored in the same order as levels in h->mb array.
5728         for(y=0; y<16; y++){
5729             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
5730             for(x=0; x<16; x++){
5731                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", *ptr);
5732                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
5733             }
5734         }
5735         for(y=0; y<8; y++){
5736             const int index= 256 + 4*(y&3) + 32*(y>>2);
5737             for(x=0; x<8; x++){
5738                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", *ptr);
5739                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5740             }
5741         }
5742         for(y=0; y<8; y++){
5743             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
5744             for(x=0; x<8; x++){
5745                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", *ptr);
5746                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5747             }
5748         }
5749
5750         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5751
5752         // All blocks are present
5753         h->cbp_table[mb_xy] = 0x1ef;
5754         h->chroma_pred_mode_table[mb_xy] = 0;
5755         // In deblocking, the quantizer is 0
5756         s->current_picture.qscale_table[mb_xy]= 0;
5757         h->chroma_qp[0] = get_chroma_qp(h, 0, 0);
5758         h->chroma_qp[1] = get_chroma_qp(h, 1, 0);
5759         // All coeffs are present
5760         memset(h->non_zero_count[mb_xy], 16, 16);
5761         s->current_picture.mb_type[mb_xy]= mb_type;
5762         h->last_qscale_diff = 0;
5763         return 0;
5764     }
5765
5766     if(MB_MBAFF){
5767         h->ref_count[0] <<= 1;
5768         h->ref_count[1] <<= 1;
5769     }
5770
5771     fill_caches(h, mb_type, 0);
5772
5773     if( IS_INTRA( mb_type ) ) {
5774         int i, pred_mode;
5775         if( IS_INTRA4x4( mb_type ) ) {
5776             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5777                 mb_type |= MB_TYPE_8x8DCT;
5778                 for( i = 0; i < 16; i+=4 ) {
5779                     int pred = pred_intra_mode( h, i );
5780                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5781                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5782                 }
5783             } else {
5784                 for( i = 0; i < 16; i++ ) {
5785                     int pred = pred_intra_mode( h, i );
5786                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5787
5788                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5789                 }
5790             }
5791             write_back_intra_pred_mode(h);
5792             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5793         } else {
5794             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5795             if( h->intra16x16_pred_mode < 0 ) return -1;
5796         }
5797         h->chroma_pred_mode_table[mb_xy] =
5798         pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5799
5800         pred_mode= check_intra_pred_mode( h, pred_mode );
5801         if( pred_mode < 0 ) return -1;
5802         h->chroma_pred_mode= pred_mode;
5803     } else if( partition_count == 4 ) {
5804         int i, j, sub_partition_count[4], list, ref[2][4];
5805
5806         if( h->slice_type == FF_B_TYPE ) {
5807             for( i = 0; i < 4; i++ ) {
5808                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5809                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5810                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5811             }
5812             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5813                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5814                 pred_direct_motion(h, &mb_type);
5815                 h->ref_cache[0][scan8[4]] =
5816                 h->ref_cache[1][scan8[4]] =
5817                 h->ref_cache[0][scan8[12]] =
5818                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5819                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5820                     for( i = 0; i < 4; i++ )
5821                         if( IS_DIRECT(h->sub_mb_type[i]) )
5822                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5823                 }
5824             }
5825         } else {
5826             for( i = 0; i < 4; i++ ) {
5827                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5828                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5829                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5830             }
5831         }
5832
5833         for( list = 0; list < h->list_count; list++ ) {
5834                 for( i = 0; i < 4; i++ ) {
5835                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5836                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5837                         if( h->ref_count[list] > 1 )
5838                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5839                         else
5840                             ref[list][i] = 0;
5841                     } else {
5842                         ref[list][i] = -1;
5843                     }
5844                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5845                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5846                 }
5847         }
5848
5849         if(dct8x8_allowed)
5850             dct8x8_allowed = get_dct8x8_allowed(h);
5851
5852         for(list=0; list<h->list_count; list++){
5853             for(i=0; i<4; i++){
5854                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5855                 if(IS_DIRECT(h->sub_mb_type[i])){
5856                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5857                     continue;
5858                 }
5859
5860                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5861                     const int sub_mb_type= h->sub_mb_type[i];
5862                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5863                     for(j=0; j<sub_partition_count[i]; j++){
5864                         int mpx, mpy;
5865                         int mx, my;
5866                         const int index= 4*i + block_width*j;
5867                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5868                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5869                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5870
5871                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5872                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5873                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5874
5875                         if(IS_SUB_8X8(sub_mb_type)){
5876                             mv_cache[ 1 ][0]=
5877                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5878                             mv_cache[ 1 ][1]=
5879                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5880
5881                             mvd_cache[ 1 ][0]=
5882                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5883                             mvd_cache[ 1 ][1]=
5884                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5885                         }else if(IS_SUB_8X4(sub_mb_type)){
5886                             mv_cache[ 1 ][0]= mx;
5887                             mv_cache[ 1 ][1]= my;
5888
5889                             mvd_cache[ 1 ][0]= mx - mpx;
5890                             mvd_cache[ 1 ][1]= my - mpy;
5891                         }else if(IS_SUB_4X8(sub_mb_type)){
5892                             mv_cache[ 8 ][0]= mx;
5893                             mv_cache[ 8 ][1]= my;
5894
5895                             mvd_cache[ 8 ][0]= mx - mpx;
5896                             mvd_cache[ 8 ][1]= my - mpy;
5897                         }
5898                         mv_cache[ 0 ][0]= mx;
5899                         mv_cache[ 0 ][1]= my;
5900
5901                         mvd_cache[ 0 ][0]= mx - mpx;
5902                         mvd_cache[ 0 ][1]= my - mpy;
5903                     }
5904                 }else{
5905                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5906                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5907                     p[0] = p[1] = p[8] = p[9] = 0;
5908                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5909                 }
5910             }
5911         }
5912     } else if( IS_DIRECT(mb_type) ) {
5913         pred_direct_motion(h, &mb_type);
5914         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5915         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5916         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5917     } else {
5918         int list, mx, my, i, mpx, mpy;
5919         if(IS_16X16(mb_type)){
5920             for(list=0; list<h->list_count; list++){
5921                 if(IS_DIR(mb_type, 0, list)){
5922                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
5923                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5924                 }else
5925                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5926             }
5927             for(list=0; list<h->list_count; list++){
5928                 if(IS_DIR(mb_type, 0, list)){
5929                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5930
5931                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5932                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5933                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5934
5935                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5936                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5937                 }else
5938                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5939             }
5940         }
5941         else if(IS_16X8(mb_type)){
5942             for(list=0; list<h->list_count; list++){
5943                     for(i=0; i<2; i++){
5944                         if(IS_DIR(mb_type, i, list)){
5945                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
5946                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5947                         }else
5948                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5949                     }
5950             }
5951             for(list=0; list<h->list_count; list++){
5952                 for(i=0; i<2; i++){
5953                     if(IS_DIR(mb_type, i, list)){
5954                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5955                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5956                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5957                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5958
5959                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5960                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5961                     }else{
5962                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5963                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5964                     }
5965                 }
5966             }
5967         }else{
5968             assert(IS_8X16(mb_type));
5969             for(list=0; list<h->list_count; list++){
5970                     for(i=0; i<2; i++){
5971                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5972                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
5973                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5974                         }else
5975                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5976                     }
5977             }
5978             for(list=0; list<h->list_count; list++){
5979                 for(i=0; i<2; i++){
5980                     if(IS_DIR(mb_type, i, list)){
5981                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5982                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5983                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5984
5985                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5986                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5987                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5988                     }else{
5989                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5990                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5991                     }
5992                 }
5993             }
5994         }
5995     }
5996
5997    if( IS_INTER( mb_type ) ) {
5998         h->chroma_pred_mode_table[mb_xy] = 0;
5999         write_back_motion( h, mb_type );
6000    }
6001
6002     if( !IS_INTRA16x16( mb_type ) ) {
6003         cbp  = decode_cabac_mb_cbp_luma( h );
6004         cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
6005     }
6006
6007     h->cbp_table[mb_xy] = h->cbp = cbp;
6008
6009     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
6010         if( decode_cabac_mb_transform_size( h ) )
6011             mb_type |= MB_TYPE_8x8DCT;
6012     }
6013     s->current_picture.mb_type[mb_xy]= mb_type;
6014
6015     if( cbp || IS_INTRA16x16( mb_type ) ) {
6016         const uint8_t *scan, *scan8x8, *dc_scan;
6017         const uint32_t *qmul;
6018         int dqp;
6019
6020         if(IS_INTERLACED(mb_type)){
6021             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
6022             scan= s->qscale ? h->field_scan : h->field_scan_q0;
6023             dc_scan= luma_dc_field_scan;
6024         }else{
6025             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
6026             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
6027             dc_scan= luma_dc_zigzag_scan;
6028         }
6029
6030         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
6031         if( dqp == INT_MIN ){
6032             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
6033             return -1;
6034         }
6035         s->qscale += dqp;
6036         if(((unsigned)s->qscale) > 51){
6037             if(s->qscale<0) s->qscale+= 52;
6038             else            s->qscale-= 52;
6039         }
6040         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
6041         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
6042
6043         if( IS_INTRA16x16( mb_type ) ) {
6044             int i;
6045             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
6046             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
6047
6048             if( cbp&15 ) {
6049                 qmul = h->dequant4_coeff[0][s->qscale];
6050                 for( i = 0; i < 16; i++ ) {
6051                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
6052                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
6053                 }
6054             } else {
6055                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
6056             }
6057         } else {
6058             int i8x8, i4x4;
6059             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
6060                 if( cbp & (1<<i8x8) ) {
6061                     if( IS_8x8DCT(mb_type) ) {
6062                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
6063                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
6064                     } else {
6065                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
6066                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
6067                             const int index = 4*i8x8 + i4x4;
6068                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
6069 //START_TIMER
6070                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
6071 //STOP_TIMER("decode_residual")
6072                         }
6073                     }
6074                 } else {
6075                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
6076                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
6077                 }
6078             }
6079         }
6080
6081         if( cbp&0x30 ){
6082             int c;
6083             for( c = 0; c < 2; c++ ) {
6084                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
6085                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
6086             }
6087         }
6088
6089         if( cbp&0x20 ) {
6090             int c, i;
6091             for( c = 0; c < 2; c++ ) {
6092                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
6093                 for( i = 0; i < 4; i++ ) {
6094                     const int index = 16 + 4 * c + i;
6095                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
6096                     decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15);
6097                 }
6098             }
6099         } else {
6100             uint8_t * const nnz= &h->non_zero_count_cache[0];
6101             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6102             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6103         }
6104     } else {
6105         uint8_t * const nnz= &h->non_zero_count_cache[0];
6106         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
6107         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6108         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6109         h->last_qscale_diff = 0;
6110     }
6111
6112     s->current_picture.qscale_table[mb_xy]= s->qscale;
6113     write_back_non_zero_count(h);
6114
6115     if(MB_MBAFF){
6116         h->ref_count[0] >>= 1;
6117         h->ref_count[1] >>= 1;
6118     }
6119
6120     return 0;
6121 }
6122
6123
6124 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6125     int i, d;
6126     const int index_a = qp + h->slice_alpha_c0_offset;
6127     const int alpha = (alpha_table+52)[index_a];
6128     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6129
6130     if( bS[0] < 4 ) {
6131         int8_t tc[4];
6132         for(i=0; i<4; i++)
6133             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6134         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6135     } else {
6136         /* 16px edge length, because bS=4 is triggered by being at
6137          * the edge of an intra MB, so all 4 bS are the same */
6138             for( d = 0; d < 16; d++ ) {
6139                 const int p0 = pix[-1];
6140                 const int p1 = pix[-2];
6141                 const int p2 = pix[-3];
6142
6143                 const int q0 = pix[0];
6144                 const int q1 = pix[1];
6145                 const int q2 = pix[2];
6146
6147                 if( FFABS( p0 - q0 ) < alpha &&
6148                     FFABS( p1 - p0 ) < beta &&
6149                     FFABS( q1 - q0 ) < beta ) {
6150
6151                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6152                         if( FFABS( p2 - p0 ) < beta)
6153                         {
6154                             const int p3 = pix[-4];
6155                             /* p0', p1', p2' */
6156                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6157                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6158                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6159                         } else {
6160                             /* p0' */
6161                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6162                         }
6163                         if( FFABS( q2 - q0 ) < beta)
6164                         {
6165                             const int q3 = pix[3];
6166                             /* q0', q1', q2' */
6167                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6168                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6169                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6170                         } else {
6171                             /* q0' */
6172                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6173                         }
6174                     }else{
6175                         /* p0', q0' */
6176                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6177                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6178                     }
6179                     tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
6180                 }
6181                 pix += stride;
6182             }
6183     }
6184 }
6185 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6186     int i;
6187     const int index_a = qp + h->slice_alpha_c0_offset;
6188     const int alpha = (alpha_table+52)[index_a];
6189     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6190
6191     if( bS[0] < 4 ) {
6192         int8_t tc[4];
6193         for(i=0; i<4; i++)
6194             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6195         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6196     } else {
6197         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6198     }
6199 }
6200
6201 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6202     int i;
6203     for( i = 0; i < 16; i++, pix += stride) {
6204         int index_a;
6205         int alpha;
6206         int beta;
6207
6208         int qp_index;
6209         int bS_index = (i >> 1);
6210         if (!MB_FIELD) {
6211             bS_index &= ~1;
6212             bS_index |= (i & 1);
6213         }
6214
6215         if( bS[bS_index] == 0 ) {
6216             continue;
6217         }
6218
6219         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6220         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6221         alpha = (alpha_table+52)[index_a];
6222         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6223
6224         if( bS[bS_index] < 4 ) {
6225             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
6226             const int p0 = pix[-1];
6227             const int p1 = pix[-2];
6228             const int p2 = pix[-3];
6229             const int q0 = pix[0];
6230             const int q1 = pix[1];
6231             const int q2 = pix[2];
6232
6233             if( FFABS( p0 - q0 ) < alpha &&
6234                 FFABS( p1 - p0 ) < beta &&
6235                 FFABS( q1 - q0 ) < beta ) {
6236                 int tc = tc0;
6237                 int i_delta;
6238
6239                 if( FFABS( p2 - p0 ) < beta ) {
6240                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6241                     tc++;
6242                 }
6243                 if( FFABS( q2 - q0 ) < beta ) {
6244                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6245                     tc++;
6246                 }
6247
6248                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6249                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6250                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6251                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6252             }
6253         }else{
6254             const int p0 = pix[-1];
6255             const int p1 = pix[-2];
6256             const int p2 = pix[-3];
6257
6258             const int q0 = pix[0];
6259             const int q1 = pix[1];
6260             const int q2 = pix[2];
6261
6262             if( FFABS( p0 - q0 ) < alpha &&
6263                 FFABS( p1 - p0 ) < beta &&
6264                 FFABS( q1 - q0 ) < beta ) {
6265
6266                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6267                     if( FFABS( p2 - p0 ) < beta)
6268                     {
6269                         const int p3 = pix[-4];
6270                         /* p0', p1', p2' */
6271                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6272                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6273                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6274                     } else {
6275                         /* p0' */
6276                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6277                     }
6278                     if( FFABS( q2 - q0 ) < beta)
6279                     {
6280                         const int q3 = pix[3];
6281                         /* q0', q1', q2' */
6282                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6283                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6284                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6285                     } else {
6286                         /* q0' */
6287                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6288                     }
6289                 }else{
6290                     /* p0', q0' */
6291                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6292                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6293                 }
6294                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6295             }
6296         }
6297     }
6298 }
6299 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6300     int i;
6301     for( i = 0; i < 8; i++, pix += stride) {
6302         int index_a;
6303         int alpha;
6304         int beta;
6305
6306         int qp_index;
6307         int bS_index = i;
6308
6309         if( bS[bS_index] == 0 ) {
6310             continue;
6311         }
6312
6313         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6314         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6315         alpha = (alpha_table+52)[index_a];
6316         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6317
6318         if( bS[bS_index] < 4 ) {
6319             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6320             const int p0 = pix[-1];
6321             const int p1 = pix[-2];
6322             const int q0 = pix[0];
6323             const int q1 = pix[1];
6324
6325             if( FFABS( p0 - q0 ) < alpha &&
6326                 FFABS( p1 - p0 ) < beta &&
6327                 FFABS( q1 - q0 ) < beta ) {
6328                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6329
6330                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6331                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6332                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6333             }
6334         }else{
6335             const int p0 = pix[-1];
6336             const int p1 = pix[-2];
6337             const int q0 = pix[0];
6338             const int q1 = pix[1];
6339
6340             if( FFABS( p0 - q0 ) < alpha &&
6341                 FFABS( p1 - p0 ) < beta &&
6342                 FFABS( q1 - q0 ) < beta ) {
6343
6344                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6345                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6346                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6347             }
6348         }
6349     }
6350 }
6351
6352 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6353     int i, d;
6354     const int index_a = qp + h->slice_alpha_c0_offset;
6355     const int alpha = (alpha_table+52)[index_a];
6356     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6357     const int pix_next  = stride;
6358
6359     if( bS[0] < 4 ) {
6360         int8_t tc[4];
6361         for(i=0; i<4; i++)
6362             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6363         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6364     } else {
6365         /* 16px edge length, see filter_mb_edgev */
6366             for( d = 0; d < 16; d++ ) {
6367                 const int p0 = pix[-1*pix_next];
6368                 const int p1 = pix[-2*pix_next];
6369                 const int p2 = pix[-3*pix_next];
6370                 const int q0 = pix[0];
6371                 const int q1 = pix[1*pix_next];
6372                 const int q2 = pix[2*pix_next];
6373
6374                 if( FFABS( p0 - q0 ) < alpha &&
6375                     FFABS( p1 - p0 ) < beta &&
6376                     FFABS( q1 - q0 ) < beta ) {
6377
6378                     const int p3 = pix[-4*pix_next];
6379                     const int q3 = pix[ 3*pix_next];
6380
6381                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6382                         if( FFABS( p2 - p0 ) < beta) {
6383                             /* p0', p1', p2' */
6384                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6385                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6386                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6387                         } else {
6388                             /* p0' */
6389                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6390                         }
6391                         if( FFABS( q2 - q0 ) < beta) {
6392                             /* q0', q1', q2' */
6393                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6394                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6395                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6396                         } else {
6397                             /* q0' */
6398                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6399                         }
6400                     }else{
6401                         /* p0', q0' */
6402                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6403                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6404                     }
6405                     tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6406                 }
6407                 pix++;
6408             }
6409     }
6410 }
6411
6412 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6413     int i;
6414     const int index_a = qp + h->slice_alpha_c0_offset;
6415     const int alpha = (alpha_table+52)[index_a];
6416     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6417
6418     if( bS[0] < 4 ) {
6419         int8_t tc[4];
6420         for(i=0; i<4; i++)
6421             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6422         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6423     } else {
6424         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6425     }
6426 }
6427
6428 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6429     MpegEncContext * const s = &h->s;
6430     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6431     int mb_xy, mb_type;
6432     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6433
6434     mb_xy = h->mb_xy;
6435
6436     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6437 1 ||
6438        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6439                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6440         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6441         return;
6442     }
6443     assert(!FRAME_MBAFF);
6444
6445     mb_type = s->current_picture.mb_type[mb_xy];
6446     qp = s->current_picture.qscale_table[mb_xy];
6447     qp0 = s->current_picture.qscale_table[mb_xy-1];
6448     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6449     qpc = get_chroma_qp( h, 0, qp );
6450     qpc0 = get_chroma_qp( h, 0, qp0 );
6451     qpc1 = get_chroma_qp( h, 0, qp1 );
6452     qp0 = (qp + qp0 + 1) >> 1;
6453     qp1 = (qp + qp1 + 1) >> 1;
6454     qpc0 = (qpc + qpc0 + 1) >> 1;
6455     qpc1 = (qpc + qpc1 + 1) >> 1;
6456     qp_thresh = 15 - h->slice_alpha_c0_offset;
6457     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6458        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6459         return;
6460
6461     if( IS_INTRA(mb_type) ) {
6462         int16_t bS4[4] = {4,4,4,4};
6463         int16_t bS3[4] = {3,3,3,3};
6464         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6465         if( IS_8x8DCT(mb_type) ) {
6466             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6467             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6468             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6469             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6470         } else {
6471             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6472             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6473             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6474             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6475             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6476             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6477             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6478             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6479         }
6480         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6481         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6482         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6483         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6484         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6485         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6486         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6487         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6488         return;
6489     } else {
6490         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6491         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6492         int edges;
6493         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6494             edges = 4;
6495             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6496         } else {
6497             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6498                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6499             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6500                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6501                              ? 3 : 0;
6502             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6503             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6504             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6505                                               (h->slice_type == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6506         }
6507         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6508             bSv[0][0] = 0x0004000400040004ULL;
6509         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6510             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6511
6512 #define FILTER(hv,dir,edge)\
6513         if(bSv[dir][edge]) {\
6514             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6515             if(!(edge&1)) {\
6516                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6517                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6518             }\
6519         }
6520         if( edges == 1 ) {
6521             FILTER(v,0,0);
6522             FILTER(h,1,0);
6523         } else if( IS_8x8DCT(mb_type) ) {
6524             FILTER(v,0,0);
6525             FILTER(v,0,2);
6526             FILTER(h,1,0);
6527             FILTER(h,1,2);
6528         } else {
6529             FILTER(v,0,0);
6530             FILTER(v,0,1);
6531             FILTER(v,0,2);
6532             FILTER(v,0,3);
6533             FILTER(h,1,0);
6534             FILTER(h,1,1);
6535             FILTER(h,1,2);
6536             FILTER(h,1,3);
6537         }
6538 #undef FILTER
6539     }
6540 }
6541
6542 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6543     MpegEncContext * const s = &h->s;
6544     const int mb_xy= mb_x + mb_y*s->mb_stride;
6545     const int mb_type = s->current_picture.mb_type[mb_xy];
6546     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6547     int first_vertical_edge_done = 0;
6548     int dir;
6549
6550     //for sufficiently low qp, filtering wouldn't do anything
6551     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6552     if(!FRAME_MBAFF){
6553         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6554         int qp = s->current_picture.qscale_table[mb_xy];
6555         if(qp <= qp_thresh
6556            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6557            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6558             return;
6559         }
6560     }
6561
6562     if (FRAME_MBAFF
6563             // left mb is in picture
6564             && h->slice_table[mb_xy-1] != 255
6565             // and current and left pair do not have the same interlaced type
6566             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6567             // and left mb is in the same slice if deblocking_filter == 2
6568             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6569         /* First vertical edge is different in MBAFF frames
6570          * There are 8 different bS to compute and 2 different Qp
6571          */
6572         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6573         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6574         int16_t bS[8];
6575         int qp[2];
6576         int bqp[2];
6577         int rqp[2];
6578         int mb_qp, mbn0_qp, mbn1_qp;
6579         int i;
6580         first_vertical_edge_done = 1;
6581
6582         if( IS_INTRA(mb_type) )
6583             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6584         else {
6585             for( i = 0; i < 8; i++ ) {
6586                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6587
6588                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6589                     bS[i] = 4;
6590                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6591                          /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
6592                          h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
6593                     bS[i] = 2;
6594                 else
6595                     bS[i] = 1;
6596             }
6597         }
6598
6599         mb_qp = s->current_picture.qscale_table[mb_xy];
6600         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6601         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6602         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6603         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6604                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6605         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6606                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6607         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6608         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6609                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6610         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6611                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6612
6613         /* Filter edge */
6614         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6615         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6616         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6617         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6618         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6619     }
6620     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6621     for( dir = 0; dir < 2; dir++ )
6622     {
6623         int edge;
6624         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6625         const int mbm_type = s->current_picture.mb_type[mbm_xy];
6626         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
6627
6628         const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6629                                   == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6630         // how often to recheck mv-based bS when iterating between edges
6631         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6632                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6633         // how often to recheck mv-based bS when iterating along each edge
6634         const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6635
6636         if (first_vertical_edge_done) {
6637             start = 1;
6638             first_vertical_edge_done = 0;
6639         }
6640
6641         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6642             start = 1;
6643
6644         if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6645             && !IS_INTERLACED(mb_type)
6646             && IS_INTERLACED(mbm_type)
6647             ) {
6648             // This is a special case in the norm where the filtering must
6649             // be done twice (one each of the field) even if we are in a
6650             // frame macroblock.
6651             //
6652             static const int nnz_idx[4] = {4,5,6,3};
6653             unsigned int tmp_linesize   = 2 *   linesize;
6654             unsigned int tmp_uvlinesize = 2 * uvlinesize;
6655             int mbn_xy = mb_xy - 2 * s->mb_stride;
6656             int qp;
6657             int i, j;
6658             int16_t bS[4];
6659
6660             for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6661                 if( IS_INTRA(mb_type) ||
6662                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6663                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6664                 } else {
6665                     const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6666                     for( i = 0; i < 4; i++ ) {
6667                         if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6668                             mbn_nnz[nnz_idx[i]] != 0 )
6669                             bS[i] = 2;
6670                         else
6671                             bS[i] = 1;
6672                     }
6673                 }
6674                 // Do not use s->qscale as luma quantizer because it has not the same
6675                 // value in IPCM macroblocks.
6676                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6677                 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6678                 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6679                 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6680                 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6681                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6682                 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6683                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6684             }
6685
6686             start = 1;
6687         }
6688
6689         /* Calculate bS */
6690         for( edge = start; edge < edges; edge++ ) {
6691             /* mbn_xy: neighbor macroblock */
6692             const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6693             const int mbn_type = s->current_picture.mb_type[mbn_xy];
6694             int16_t bS[4];
6695             int qp;
6696
6697             if( (edge&1) && IS_8x8DCT(mb_type) )
6698                 continue;
6699
6700             if( IS_INTRA(mb_type) ||
6701                 IS_INTRA(mbn_type) ) {
6702                 int value;
6703                 if (edge == 0) {
6704                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6705                         || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6706                     ) {
6707                         value = 4;
6708                     } else {
6709                         value = 3;
6710                     }
6711                 } else {
6712                     value = 3;
6713                 }
6714                 bS[0] = bS[1] = bS[2] = bS[3] = value;
6715             } else {
6716                 int i, l;
6717                 int mv_done;
6718
6719                 if( edge & mask_edge ) {
6720                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
6721                     mv_done = 1;
6722                 }
6723                 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6724                     bS[0] = bS[1] = bS[2] = bS[3] = 1;
6725                     mv_done = 1;
6726                 }
6727                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6728                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
6729                     int bn_idx= b_idx - (dir ? 8:1);
6730                     int v = 0;
6731                     int xn= h->slice_type == FF_B_TYPE && h->ref2frm[0][h->ref_cache[0][b_idx]+2] != h->ref2frm[0][h->ref_cache[0][bn_idx]+2];
6732
6733                     for( l = 0; !v && l < 1 + (h->slice_type == FF_B_TYPE); l++ ) {
6734                         int ln= l^xn;
6735                         v |= h->ref2frm[l][h->ref_cache[l][b_idx]+2] != h->ref2frm[ln][h->ref_cache[ln][bn_idx]+2] ||
6736                              FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6737                              FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6738                     }
6739                     bS[0] = bS[1] = bS[2] = bS[3] = v;
6740                     mv_done = 1;
6741                 }
6742                 else
6743                     mv_done = 0;
6744
6745                 for( i = 0; i < 4; i++ ) {
6746                     int x = dir == 0 ? edge : i;
6747                     int y = dir == 0 ? i    : edge;
6748                     int b_idx= 8 + 4 + x + 8*y;
6749                     int bn_idx= b_idx - (dir ? 8:1);
6750
6751                     if( h->non_zero_count_cache[b_idx] != 0 ||
6752                         h->non_zero_count_cache[bn_idx] != 0 ) {
6753                         bS[i] = 2;
6754                     }
6755                     else if(!mv_done)
6756                     {
6757                         int xn= h->slice_type == FF_B_TYPE && h->ref2frm[0][h->ref_cache[0][b_idx]+2] != h->ref2frm[0][h->ref_cache[0][bn_idx]+2];
6758                         bS[i] = 0;
6759                         for( l = 0; l < 1 + (h->slice_type == FF_B_TYPE); l++ ) {
6760                             int ln= l^xn;
6761                             if( h->ref2frm[l][h->ref_cache[l][b_idx]+2] != h->ref2frm[ln][h->ref_cache[ln][bn_idx]+2] ||
6762                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6763                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6764                                 bS[i] = 1;
6765                                 break;
6766                             }
6767                         }
6768                     }
6769                 }
6770
6771                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6772                     continue;
6773             }
6774
6775             /* Filter edge */
6776             // Do not use s->qscale as luma quantizer because it has not the same
6777             // value in IPCM macroblocks.
6778             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6779             //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6780             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6781             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6782             if( dir == 0 ) {
6783                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6784                 if( (edge&1) == 0 ) {
6785                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6786                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6787                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6788                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6789                 }
6790             } else {
6791                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6792                 if( (edge&1) == 0 ) {
6793                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6794                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6795                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6796                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6797                 }
6798             }
6799         }
6800     }
6801 }
6802
6803 static int decode_slice(struct AVCodecContext *avctx, H264Context *h){
6804     MpegEncContext * const s = &h->s;
6805     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6806
6807     s->mb_skip_run= -1;
6808
6809     if( h->pps.cabac ) {
6810         int i;
6811
6812         /* realign */
6813         align_get_bits( &s->gb );
6814
6815         /* init cabac */
6816         ff_init_cabac_states( &h->cabac);
6817         ff_init_cabac_decoder( &h->cabac,
6818                                s->gb.buffer + get_bits_count(&s->gb)/8,
6819                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6820         /* calculate pre-state */
6821         for( i= 0; i < 460; i++ ) {
6822             int pre;
6823             if( h->slice_type == FF_I_TYPE )
6824                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6825             else
6826                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6827
6828             if( pre <= 63 )
6829                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6830             else
6831                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6832         }
6833
6834         for(;;){
6835 //START_TIMER
6836             int ret = decode_mb_cabac(h);
6837             int eos;
6838 //STOP_TIMER("decode_mb_cabac")
6839
6840             if(ret>=0) hl_decode_mb(h);
6841
6842             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6843                 s->mb_y++;
6844
6845                 if(ret>=0) ret = decode_mb_cabac(h);
6846
6847                 if(ret>=0) hl_decode_mb(h);
6848                 s->mb_y--;
6849             }
6850             eos = get_cabac_terminate( &h->cabac );
6851
6852             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6853                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6854                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6855                 return -1;
6856             }
6857
6858             if( ++s->mb_x >= s->mb_width ) {
6859                 s->mb_x = 0;
6860                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6861                 ++s->mb_y;
6862                 if(FIELD_OR_MBAFF_PICTURE) {
6863                     ++s->mb_y;
6864                 }
6865             }
6866
6867             if( eos || s->mb_y >= s->mb_height ) {
6868                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6869                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6870                 return 0;
6871             }
6872         }
6873
6874     } else {
6875         for(;;){
6876             int ret = decode_mb_cavlc(h);
6877
6878             if(ret>=0) hl_decode_mb(h);
6879
6880             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6881                 s->mb_y++;
6882                 ret = decode_mb_cavlc(h);
6883
6884                 if(ret>=0) hl_decode_mb(h);
6885                 s->mb_y--;
6886             }
6887
6888             if(ret<0){
6889                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6890                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6891
6892                 return -1;
6893             }
6894
6895             if(++s->mb_x >= s->mb_width){
6896                 s->mb_x=0;
6897                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6898                 ++s->mb_y;
6899                 if(FIELD_OR_MBAFF_PICTURE) {
6900                     ++s->mb_y;
6901                 }
6902                 if(s->mb_y >= s->mb_height){
6903                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6904
6905                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6906                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6907
6908                         return 0;
6909                     }else{
6910                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6911
6912                         return -1;
6913                     }
6914                 }
6915             }
6916
6917             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6918                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6919                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6920                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6921
6922                     return 0;
6923                 }else{
6924                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6925
6926                     return -1;
6927                 }
6928             }
6929         }
6930     }
6931
6932 #if 0
6933     for(;s->mb_y < s->mb_height; s->mb_y++){
6934         for(;s->mb_x < s->mb_width; s->mb_x++){
6935             int ret= decode_mb(h);
6936
6937             hl_decode_mb(h);
6938
6939             if(ret<0){
6940                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6941                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6942
6943                 return -1;
6944             }
6945
6946             if(++s->mb_x >= s->mb_width){
6947                 s->mb_x=0;
6948                 if(++s->mb_y >= s->mb_height){
6949                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6950                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6951
6952                         return 0;
6953                     }else{
6954                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6955
6956                         return -1;
6957                     }
6958                 }
6959             }
6960
6961             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6962                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6963                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6964
6965                     return 0;
6966                 }else{
6967                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6968
6969                     return -1;
6970                 }
6971             }
6972         }
6973         s->mb_x=0;
6974         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6975     }
6976 #endif
6977     return -1; //not reached
6978 }
6979
6980 static int decode_unregistered_user_data(H264Context *h, int size){
6981     MpegEncContext * const s = &h->s;
6982     uint8_t user_data[16+256];
6983     int e, build, i;
6984
6985     if(size<16)
6986         return -1;
6987
6988     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6989         user_data[i]= get_bits(&s->gb, 8);
6990     }
6991
6992     user_data[i]= 0;
6993     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6994     if(e==1 && build>=0)
6995         h->x264_build= build;
6996
6997     if(s->avctx->debug & FF_DEBUG_BUGS)
6998         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6999
7000     for(; i<size; i++)
7001         skip_bits(&s->gb, 8);
7002
7003     return 0;
7004 }
7005
7006 static int decode_sei(H264Context *h){
7007     MpegEncContext * const s = &h->s;
7008
7009     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
7010         int size, type;
7011
7012         type=0;
7013         do{
7014             type+= show_bits(&s->gb, 8);
7015         }while(get_bits(&s->gb, 8) == 255);
7016
7017         size=0;
7018         do{
7019             size+= show_bits(&s->gb, 8);
7020         }while(get_bits(&s->gb, 8) == 255);
7021
7022         switch(type){
7023         case 5:
7024             if(decode_unregistered_user_data(h, size) < 0)
7025                 return -1;
7026             break;
7027         default:
7028             skip_bits(&s->gb, 8*size);
7029         }
7030
7031         //FIXME check bits here
7032         align_get_bits(&s->gb);
7033     }
7034
7035     return 0;
7036 }
7037
7038 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
7039     MpegEncContext * const s = &h->s;
7040     int cpb_count, i;
7041     cpb_count = get_ue_golomb(&s->gb) + 1;
7042     get_bits(&s->gb, 4); /* bit_rate_scale */
7043     get_bits(&s->gb, 4); /* cpb_size_scale */
7044     for(i=0; i<cpb_count; i++){
7045         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
7046         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
7047         get_bits1(&s->gb);     /* cbr_flag */
7048     }
7049     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
7050     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
7051     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
7052     get_bits(&s->gb, 5); /* time_offset_length */
7053 }
7054
7055 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
7056     MpegEncContext * const s = &h->s;
7057     int aspect_ratio_info_present_flag;
7058     unsigned int aspect_ratio_idc;
7059     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
7060
7061     aspect_ratio_info_present_flag= get_bits1(&s->gb);
7062
7063     if( aspect_ratio_info_present_flag ) {
7064         aspect_ratio_idc= get_bits(&s->gb, 8);
7065         if( aspect_ratio_idc == EXTENDED_SAR ) {
7066             sps->sar.num= get_bits(&s->gb, 16);
7067             sps->sar.den= get_bits(&s->gb, 16);
7068         }else if(aspect_ratio_idc < sizeof(pixel_aspect)/sizeof(*pixel_aspect)){
7069             sps->sar=  pixel_aspect[aspect_ratio_idc];
7070         }else{
7071             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
7072             return -1;
7073         }
7074     }else{
7075         sps->sar.num=
7076         sps->sar.den= 0;
7077     }
7078 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
7079
7080     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
7081         get_bits1(&s->gb);      /* overscan_appropriate_flag */
7082     }
7083
7084     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
7085         get_bits(&s->gb, 3);    /* video_format */
7086         get_bits1(&s->gb);      /* video_full_range_flag */
7087         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
7088             get_bits(&s->gb, 8); /* colour_primaries */
7089             get_bits(&s->gb, 8); /* transfer_characteristics */
7090             get_bits(&s->gb, 8); /* matrix_coefficients */
7091         }
7092     }
7093
7094     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
7095         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
7096         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
7097     }
7098
7099     sps->timing_info_present_flag = get_bits1(&s->gb);
7100     if(sps->timing_info_present_flag){
7101         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7102         sps->time_scale = get_bits_long(&s->gb, 32);
7103         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7104     }
7105
7106     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7107     if(nal_hrd_parameters_present_flag)
7108         decode_hrd_parameters(h, sps);
7109     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7110     if(vcl_hrd_parameters_present_flag)
7111         decode_hrd_parameters(h, sps);
7112     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
7113         get_bits1(&s->gb);     /* low_delay_hrd_flag */
7114     get_bits1(&s->gb);         /* pic_struct_present_flag */
7115
7116     sps->bitstream_restriction_flag = get_bits1(&s->gb);
7117     if(sps->bitstream_restriction_flag){
7118         unsigned int num_reorder_frames;
7119         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
7120         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7121         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7122         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7123         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7124         num_reorder_frames= get_ue_golomb(&s->gb);
7125         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7126
7127         if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7128             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", num_reorder_frames);
7129             return -1;
7130         }
7131
7132         sps->num_reorder_frames= num_reorder_frames;
7133     }
7134
7135     return 0;
7136 }
7137
7138 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7139                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7140     MpegEncContext * const s = &h->s;
7141     int i, last = 8, next = 8;
7142     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
7143     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7144         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7145     else
7146     for(i=0;i<size;i++){
7147         if(next)
7148             next = (last + get_se_golomb(&s->gb)) & 0xff;
7149         if(!i && !next){ /* matrix not written, we use the preset one */
7150             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7151             break;
7152         }
7153         last = factors[scan[i]] = next ? next : last;
7154     }
7155 }
7156
7157 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7158                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7159     MpegEncContext * const s = &h->s;
7160     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7161     const uint8_t *fallback[4] = {
7162         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7163         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7164         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7165         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7166     };
7167     if(get_bits1(&s->gb)){
7168         sps->scaling_matrix_present |= is_sps;
7169         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7170         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7171         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7172         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7173         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7174         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7175         if(is_sps || pps->transform_8x8_mode){
7176             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7177             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7178         }
7179     } else if(fallback_sps) {
7180         memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
7181         memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
7182     }
7183 }
7184
7185 /**
7186  * Returns and optionally allocates SPS / PPS structures in the supplied array 'vec'
7187  */
7188 static void *
7189 alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
7190                     const size_t size, const char *name)
7191 {
7192     if(id>=max) {
7193         av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%d) out of range\n", name, id);
7194         return NULL;
7195     }
7196
7197     if(!vec[id]) {
7198         vec[id] = av_mallocz(size);
7199         if(vec[id] == NULL)
7200             av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
7201     }
7202     return vec[id];
7203 }
7204
7205 static inline int decode_seq_parameter_set(H264Context *h){
7206     MpegEncContext * const s = &h->s;
7207     int profile_idc, level_idc;
7208     unsigned int sps_id, tmp, mb_width, mb_height;
7209     int i;
7210     SPS *sps;
7211
7212     profile_idc= get_bits(&s->gb, 8);
7213     get_bits1(&s->gb);   //constraint_set0_flag
7214     get_bits1(&s->gb);   //constraint_set1_flag
7215     get_bits1(&s->gb);   //constraint_set2_flag
7216     get_bits1(&s->gb);   //constraint_set3_flag
7217     get_bits(&s->gb, 4); // reserved
7218     level_idc= get_bits(&s->gb, 8);
7219     sps_id= get_ue_golomb(&s->gb);
7220
7221     sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
7222     if(sps == NULL)
7223         return -1;
7224
7225     sps->profile_idc= profile_idc;
7226     sps->level_idc= level_idc;
7227
7228     if(sps->profile_idc >= 100){ //high profile
7229         if(get_ue_golomb(&s->gb) == 3) //chroma_format_idc
7230             get_bits1(&s->gb);  //residual_color_transform_flag
7231         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7232         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7233         sps->transform_bypass = get_bits1(&s->gb);
7234         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7235     }else
7236         sps->scaling_matrix_present = 0;
7237
7238     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7239     sps->poc_type= get_ue_golomb(&s->gb);
7240
7241     if(sps->poc_type == 0){ //FIXME #define
7242         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7243     } else if(sps->poc_type == 1){//FIXME #define
7244         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7245         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7246         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7247         tmp= get_ue_golomb(&s->gb);
7248
7249         if(tmp >= sizeof(sps->offset_for_ref_frame) / sizeof(sps->offset_for_ref_frame[0])){
7250             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
7251             return -1;
7252         }
7253         sps->poc_cycle_length= tmp;
7254
7255         for(i=0; i<sps->poc_cycle_length; i++)
7256             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7257     }else if(sps->poc_type != 2){
7258         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7259         return -1;
7260     }
7261
7262     tmp= get_ue_golomb(&s->gb);
7263     if(tmp > MAX_PICTURE_COUNT-2 || tmp >= 32){
7264         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7265         return -1;
7266     }
7267     sps->ref_frame_count= tmp;
7268     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7269     mb_width= get_ue_golomb(&s->gb) + 1;
7270     mb_height= get_ue_golomb(&s->gb) + 1;
7271     if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
7272        avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
7273         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7274         return -1;
7275     }
7276     sps->mb_width = mb_width;
7277     sps->mb_height= mb_height;
7278
7279     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7280     if(!sps->frame_mbs_only_flag)
7281         sps->mb_aff= get_bits1(&s->gb);
7282     else
7283         sps->mb_aff= 0;
7284
7285     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7286
7287 #ifndef ALLOW_INTERLACE
7288     if(sps->mb_aff)
7289         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7290 #endif
7291     if(!sps->direct_8x8_inference_flag && sps->mb_aff)
7292         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
7293
7294     sps->crop= get_bits1(&s->gb);
7295     if(sps->crop){
7296         sps->crop_left  = get_ue_golomb(&s->gb);
7297         sps->crop_right = get_ue_golomb(&s->gb);
7298         sps->crop_top   = get_ue_golomb(&s->gb);
7299         sps->crop_bottom= get_ue_golomb(&s->gb);
7300         if(sps->crop_left || sps->crop_top){
7301             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7302         }
7303         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !h->sps.frame_mbs_only_flag)){
7304             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7305         }
7306     }else{
7307         sps->crop_left  =
7308         sps->crop_right =
7309         sps->crop_top   =
7310         sps->crop_bottom= 0;
7311     }
7312
7313     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7314     if( sps->vui_parameters_present_flag )
7315         decode_vui_parameters(h, sps);
7316
7317     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7318         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
7319                sps_id, sps->profile_idc, sps->level_idc,
7320                sps->poc_type,
7321                sps->ref_frame_count,
7322                sps->mb_width, sps->mb_height,
7323                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7324                sps->direct_8x8_inference_flag ? "8B8" : "",
7325                sps->crop_left, sps->crop_right,
7326                sps->crop_top, sps->crop_bottom,
7327                sps->vui_parameters_present_flag ? "VUI" : ""
7328                );
7329     }
7330     return 0;
7331 }
7332
7333 static void
7334 build_qp_table(PPS *pps, int t, int index)
7335 {
7336     int i;
7337     for(i = 0; i < 255; i++)
7338         pps->chroma_qp_table[t][i & 0xff] = chroma_qp[av_clip(i + index, 0, 51)];
7339 }
7340
7341 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7342     MpegEncContext * const s = &h->s;
7343     unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
7344     PPS *pps;
7345
7346     pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
7347     if(pps == NULL)
7348         return -1;
7349
7350     tmp= get_ue_golomb(&s->gb);
7351     if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
7352         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7353         return -1;
7354     }
7355     pps->sps_id= tmp;
7356
7357     pps->cabac= get_bits1(&s->gb);
7358     pps->pic_order_present= get_bits1(&s->gb);
7359     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7360     if(pps->slice_group_count > 1 ){
7361         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7362         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7363         switch(pps->mb_slice_group_map_type){
7364         case 0:
7365 #if 0
7366 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7367 |    run_length[ i ]                                |1  |ue(v)   |
7368 #endif
7369             break;
7370         case 2:
7371 #if 0
7372 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7373 |{                                                  |   |        |
7374 |    top_left_mb[ i ]                               |1  |ue(v)   |
7375 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7376 |   }                                               |   |        |
7377 #endif
7378             break;
7379         case 3:
7380         case 4:
7381         case 5:
7382 #if 0
7383 |   slice_group_change_direction_flag               |1  |u(1)    |
7384 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7385 #endif
7386             break;
7387         case 6:
7388 #if 0
7389 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7390 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7391 |)                                                  |   |        |
7392 |    slice_group_id[ i ]                            |1  |u(v)    |
7393 #endif
7394             break;
7395         }
7396     }
7397     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7398     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7399     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7400         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7401         pps->ref_count[0]= pps->ref_count[1]= 1;
7402         return -1;
7403     }
7404
7405     pps->weighted_pred= get_bits1(&s->gb);
7406     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7407     pps->init_qp= get_se_golomb(&s->gb) + 26;
7408     pps->init_qs= get_se_golomb(&s->gb) + 26;
7409     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7410     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7411     pps->constrained_intra_pred= get_bits1(&s->gb);
7412     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7413
7414     pps->transform_8x8_mode= 0;
7415     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7416     memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
7417     memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
7418
7419     if(get_bits_count(&s->gb) < bit_length){
7420         pps->transform_8x8_mode= get_bits1(&s->gb);
7421         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7422         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7423     } else {
7424         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7425     }
7426
7427     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7428     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1]) {
7429         build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7430         h->pps.chroma_qp_diff= 1;
7431     } else
7432         memcpy(pps->chroma_qp_table[1], pps->chroma_qp_table[0], 256);
7433
7434     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7435         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7436                pps_id, pps->sps_id,
7437                pps->cabac ? "CABAC" : "CAVLC",
7438                pps->slice_group_count,
7439                pps->ref_count[0], pps->ref_count[1],
7440                pps->weighted_pred ? "weighted" : "",
7441                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7442                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7443                pps->constrained_intra_pred ? "CONSTR" : "",
7444                pps->redundant_pic_cnt_present ? "REDU" : "",
7445                pps->transform_8x8_mode ? "8x8DCT" : ""
7446                );
7447     }
7448
7449     return 0;
7450 }
7451
7452 /**
7453  * Call decode_slice() for each context.
7454  *
7455  * @param h h264 master context
7456  * @param context_count number of contexts to execute
7457  */
7458 static void execute_decode_slices(H264Context *h, int context_count){
7459     MpegEncContext * const s = &h->s;
7460     AVCodecContext * const avctx= s->avctx;
7461     H264Context *hx;
7462     int i;
7463
7464     if(context_count == 1) {
7465         decode_slice(avctx, h);
7466     } else {
7467         for(i = 1; i < context_count; i++) {
7468             hx = h->thread_context[i];
7469             hx->s.error_resilience = avctx->error_resilience;
7470             hx->s.error_count = 0;
7471         }
7472
7473         avctx->execute(avctx, (void *)decode_slice,
7474                        (void **)h->thread_context, NULL, context_count);
7475
7476         /* pull back stuff from slices to master context */
7477         hx = h->thread_context[context_count - 1];
7478         s->mb_x = hx->s.mb_x;
7479         s->mb_y = hx->s.mb_y;
7480         s->dropable = hx->s.dropable;
7481         s->picture_structure = hx->s.picture_structure;
7482         for(i = 1; i < context_count; i++)
7483             h->s.error_count += h->thread_context[i]->s.error_count;
7484     }
7485 }
7486
7487
7488 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7489     MpegEncContext * const s = &h->s;
7490     AVCodecContext * const avctx= s->avctx;
7491     int buf_index=0;
7492     H264Context *hx; ///< thread context
7493     int context_count = 0;
7494
7495     h->max_contexts = avctx->thread_count;
7496 #if 0
7497     int i;
7498     for(i=0; i<50; i++){
7499         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7500     }
7501 #endif
7502     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7503         h->current_slice = 0;
7504         if (!s->first_field)
7505             s->current_picture_ptr= NULL;
7506     }
7507
7508     for(;;){
7509         int consumed;
7510         int dst_length;
7511         int bit_length;
7512         const uint8_t *ptr;
7513         int i, nalsize = 0;
7514         int err;
7515
7516         if(h->is_avc) {
7517             if(buf_index >= buf_size) break;
7518             nalsize = 0;
7519             for(i = 0; i < h->nal_length_size; i++)
7520                 nalsize = (nalsize << 8) | buf[buf_index++];
7521             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7522                 if(nalsize == 1){
7523                     buf_index++;
7524                     continue;
7525                 }else{
7526                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7527                     break;
7528                 }
7529             }
7530         } else {
7531             // start code prefix search
7532             for(; buf_index + 3 < buf_size; buf_index++){
7533                 // This should always succeed in the first iteration.
7534                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7535                     break;
7536             }
7537
7538             if(buf_index+3 >= buf_size) break;
7539
7540             buf_index+=3;
7541         }
7542
7543         hx = h->thread_context[context_count];
7544
7545         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7546         if (ptr==NULL || dst_length < 0){
7547             return -1;
7548         }
7549         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7550             dst_length--;
7551         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7552
7553         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7554             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7555         }
7556
7557         if (h->is_avc && (nalsize != consumed)){
7558             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7559             consumed= nalsize;
7560         }
7561
7562         buf_index += consumed;
7563
7564         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7565            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7566             continue;
7567
7568       again:
7569         err = 0;
7570         switch(hx->nal_unit_type){
7571         case NAL_IDR_SLICE:
7572             if (h->nal_unit_type != NAL_IDR_SLICE) {
7573                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7574                 return -1;
7575             }
7576             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7577         case NAL_SLICE:
7578             init_get_bits(&hx->s.gb, ptr, bit_length);
7579             hx->intra_gb_ptr=
7580             hx->inter_gb_ptr= &hx->s.gb;
7581             hx->s.data_partitioning = 0;
7582
7583             if((err = decode_slice_header(hx, h)))
7584                break;
7585
7586             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7587             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7588                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7589                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type!=FF_B_TYPE)
7590                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type==FF_I_TYPE)
7591                && avctx->skip_frame < AVDISCARD_ALL)
7592                 context_count++;
7593             break;
7594         case NAL_DPA:
7595             init_get_bits(&hx->s.gb, ptr, bit_length);
7596             hx->intra_gb_ptr=
7597             hx->inter_gb_ptr= NULL;
7598             hx->s.data_partitioning = 1;
7599
7600             err = decode_slice_header(hx, h);
7601             break;
7602         case NAL_DPB:
7603             init_get_bits(&hx->intra_gb, ptr, bit_length);
7604             hx->intra_gb_ptr= &hx->intra_gb;
7605             break;
7606         case NAL_DPC:
7607             init_get_bits(&hx->inter_gb, ptr, bit_length);
7608             hx->inter_gb_ptr= &hx->inter_gb;
7609
7610             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7611                && s->context_initialized
7612                && s->hurry_up < 5
7613                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7614                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type!=FF_B_TYPE)
7615                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type==FF_I_TYPE)
7616                && avctx->skip_frame < AVDISCARD_ALL)
7617                 context_count++;
7618             break;
7619         case NAL_SEI:
7620             init_get_bits(&s->gb, ptr, bit_length);
7621             decode_sei(h);
7622             break;
7623         case NAL_SPS:
7624             init_get_bits(&s->gb, ptr, bit_length);
7625             decode_seq_parameter_set(h);
7626
7627             if(s->flags& CODEC_FLAG_LOW_DELAY)
7628                 s->low_delay=1;
7629
7630             if(avctx->has_b_frames < 2)
7631                 avctx->has_b_frames= !s->low_delay;
7632             break;
7633         case NAL_PPS:
7634             init_get_bits(&s->gb, ptr, bit_length);
7635
7636             decode_picture_parameter_set(h, bit_length);
7637
7638             break;
7639         case NAL_AUD:
7640         case NAL_END_SEQUENCE:
7641         case NAL_END_STREAM:
7642         case NAL_FILLER_DATA:
7643         case NAL_SPS_EXT:
7644         case NAL_AUXILIARY_SLICE:
7645             break;
7646         default:
7647             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7648         }
7649
7650         if(context_count == h->max_contexts) {
7651             execute_decode_slices(h, context_count);
7652             context_count = 0;
7653         }
7654
7655         if (err < 0)
7656             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7657         else if(err == 1) {
7658             /* Slice could not be decoded in parallel mode, copy down
7659              * NAL unit stuff to context 0 and restart. Note that
7660              * rbsp_buffer is not transfered, but since we no longer
7661              * run in parallel mode this should not be an issue. */
7662             h->nal_unit_type = hx->nal_unit_type;
7663             h->nal_ref_idc   = hx->nal_ref_idc;
7664             hx = h;
7665             goto again;
7666         }
7667     }
7668     if(context_count)
7669         execute_decode_slices(h, context_count);
7670     return buf_index;
7671 }
7672
7673 /**
7674  * returns the number of bytes consumed for building the current frame
7675  */
7676 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7677     if(s->flags&CODEC_FLAG_TRUNCATED){
7678         pos -= s->parse_context.last_index;
7679         if(pos<0) pos=0; // FIXME remove (unneeded?)
7680
7681         return pos;
7682     }else{
7683         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7684         if(pos+10>buf_size) pos=buf_size; // oops ;)
7685
7686         return pos;
7687     }
7688 }
7689
7690 static int decode_frame(AVCodecContext *avctx,
7691                              void *data, int *data_size,
7692                              const uint8_t *buf, int buf_size)
7693 {
7694     H264Context *h = avctx->priv_data;
7695     MpegEncContext *s = &h->s;
7696     AVFrame *pict = data;
7697     int buf_index;
7698
7699     s->flags= avctx->flags;
7700     s->flags2= avctx->flags2;
7701
7702     if(s->flags&CODEC_FLAG_TRUNCATED){
7703         const int next= ff_h264_find_frame_end(h, buf, buf_size);
7704         assert((buf_size > 0) || (next == END_NOT_FOUND));
7705
7706         if( ff_combine_frame(&s->parse_context, next, &buf, &buf_size) < 0 )
7707           return buf_size;
7708 //printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
7709     }
7710
7711    /* no supplementary picture */
7712     if (buf_size == 0) {
7713         Picture *out;
7714         int i, out_idx;
7715
7716 //FIXME factorize this with the output code below
7717         out = h->delayed_pic[0];
7718         out_idx = 0;
7719         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7720             if(h->delayed_pic[i]->poc < out->poc){
7721                 out = h->delayed_pic[i];
7722                 out_idx = i;
7723             }
7724
7725         for(i=out_idx; h->delayed_pic[i]; i++)
7726             h->delayed_pic[i] = h->delayed_pic[i+1];
7727
7728         if(out){
7729             *data_size = sizeof(AVFrame);
7730             *pict= *(AVFrame*)out;
7731         }
7732
7733         return 0;
7734     }
7735
7736     if(h->is_avc && !h->got_avcC) {
7737         int i, cnt, nalsize;
7738         unsigned char *p = avctx->extradata;
7739         if(avctx->extradata_size < 7) {
7740             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7741             return -1;
7742         }
7743         if(*p != 1) {
7744             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7745             return -1;
7746         }
7747         /* sps and pps in the avcC always have length coded with 2 bytes,
7748            so put a fake nal_length_size = 2 while parsing them */
7749         h->nal_length_size = 2;
7750         // Decode sps from avcC
7751         cnt = *(p+5) & 0x1f; // Number of sps
7752         p += 6;
7753         for (i = 0; i < cnt; i++) {
7754             nalsize = AV_RB16(p) + 2;
7755             if(decode_nal_units(h, p, nalsize) < 0) {
7756                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7757                 return -1;
7758             }
7759             p += nalsize;
7760         }
7761         // Decode pps from avcC
7762         cnt = *(p++); // Number of pps
7763         for (i = 0; i < cnt; i++) {
7764             nalsize = AV_RB16(p) + 2;
7765             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7766                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7767                 return -1;
7768             }
7769             p += nalsize;
7770         }
7771         // Now store right nal length size, that will be use to parse all other nals
7772         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7773         // Do not reparse avcC
7774         h->got_avcC = 1;
7775     }
7776
7777     if(avctx->frame_number==0 && !h->is_avc && s->avctx->extradata_size){
7778         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7779             return -1;
7780     }
7781
7782     buf_index=decode_nal_units(h, buf, buf_size);
7783     if(buf_index < 0)
7784         return -1;
7785
7786     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7787         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7788         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7789         return -1;
7790     }
7791
7792     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7793         Picture *out = s->current_picture_ptr;
7794         Picture *cur = s->current_picture_ptr;
7795         int i, pics, cross_idr, out_of_order, out_idx;
7796
7797         s->mb_y= 0;
7798
7799         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7800         s->current_picture_ptr->pict_type= s->pict_type;
7801
7802         h->prev_frame_num_offset= h->frame_num_offset;
7803         h->prev_frame_num= h->frame_num;
7804         if(!s->dropable) {
7805             h->prev_poc_msb= h->poc_msb;
7806             h->prev_poc_lsb= h->poc_lsb;
7807             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7808         }
7809
7810         /*
7811          * FIXME: Error handling code does not seem to support interlaced
7812          * when slices span multiple rows
7813          * The ff_er_add_slice calls don't work right for bottom
7814          * fields; they cause massive erroneous error concealing
7815          * Error marking covers both fields (top and bottom).
7816          * This causes a mismatched s->error_count
7817          * and a bad error table. Further, the error count goes to
7818          * INT_MAX when called for bottom field, because mb_y is
7819          * past end by one (callers fault) and resync_mb_y != 0
7820          * causes problems for the first MB line, too.
7821          */
7822         if (!FIELD_PICTURE)
7823             ff_er_frame_end(s);
7824
7825         MPV_frame_end(s);
7826
7827         if (s->first_field) {
7828             /* Wait for second field. */
7829             *data_size = 0;
7830
7831         } else {
7832             cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7833             /* Derive top_field_first from field pocs. */
7834             cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7835
7836         //FIXME do something with unavailable reference frames
7837
7838             /* Sort B-frames into display order */
7839
7840             if(h->sps.bitstream_restriction_flag
7841                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7842                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7843                 s->low_delay = 0;
7844             }
7845
7846             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7847                && !h->sps.bitstream_restriction_flag){
7848                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7849                 s->low_delay= 0;
7850             }
7851
7852             pics = 0;
7853             while(h->delayed_pic[pics]) pics++;
7854
7855             assert(pics <= MAX_DELAYED_PIC_COUNT);
7856
7857             h->delayed_pic[pics++] = cur;
7858             if(cur->reference == 0)
7859                 cur->reference = DELAYED_PIC_REF;
7860
7861             cross_idr = 0;
7862             for(i=0; h->delayed_pic[i]; i++)
7863                 if(h->delayed_pic[i]->key_frame || h->delayed_pic[i]->poc==0)
7864                     cross_idr = 1;
7865
7866             out = h->delayed_pic[0];
7867             out_idx = 0;
7868             for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7869                 if(h->delayed_pic[i]->poc < out->poc){
7870                     out = h->delayed_pic[i];
7871                     out_idx = i;
7872                 }
7873
7874             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7875
7876             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7877                 { }
7878             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7879                || (s->low_delay &&
7880                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7881                  || cur->pict_type == FF_B_TYPE)))
7882             {
7883                 s->low_delay = 0;
7884                 s->avctx->has_b_frames++;
7885             }
7886
7887             if(out_of_order || pics > s->avctx->has_b_frames){
7888                 out->reference &= ~DELAYED_PIC_REF;
7889                 for(i=out_idx; h->delayed_pic[i]; i++)
7890                     h->delayed_pic[i] = h->delayed_pic[i+1];
7891             }
7892             if(!out_of_order && pics > s->avctx->has_b_frames){
7893                 *data_size = sizeof(AVFrame);
7894
7895                 h->outputed_poc = out->poc;
7896                 *pict= *(AVFrame*)out;
7897             }else{
7898                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7899             }
7900         }
7901     }
7902
7903     assert(pict->data[0] || !*data_size);
7904     ff_print_debug_info(s, pict);
7905 //printf("out %d\n", (int)pict->data[0]);
7906 #if 0 //?
7907
7908     /* Return the Picture timestamp as the frame number */
7909     /* we subtract 1 because it is added on utils.c     */
7910     avctx->frame_number = s->picture_number - 1;
7911 #endif
7912     return get_consumed_bytes(s, buf_index, buf_size);
7913 }
7914 #if 0
7915 static inline void fill_mb_avail(H264Context *h){
7916     MpegEncContext * const s = &h->s;
7917     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
7918
7919     if(s->mb_y){
7920         h->mb_avail[0]= s->mb_x                 && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
7921         h->mb_avail[1]=                            h->slice_table[mb_xy - s->mb_stride    ] == h->slice_num;
7922         h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
7923     }else{
7924         h->mb_avail[0]=
7925         h->mb_avail[1]=
7926         h->mb_avail[2]= 0;
7927     }
7928     h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
7929     h->mb_avail[4]= 1; //FIXME move out
7930     h->mb_avail[5]= 0; //FIXME move out
7931 }
7932 #endif
7933
7934 #ifdef TEST
7935 #undef printf
7936 #undef random
7937 #define COUNT 8000
7938 #define SIZE (COUNT*40)
7939 int main(void){
7940     int i;
7941     uint8_t temp[SIZE];
7942     PutBitContext pb;
7943     GetBitContext gb;
7944 //    int int_temp[10000];
7945     DSPContext dsp;
7946     AVCodecContext avctx;
7947
7948     dsputil_init(&dsp, &avctx);
7949
7950     init_put_bits(&pb, temp, SIZE);
7951     printf("testing unsigned exp golomb\n");
7952     for(i=0; i<COUNT; i++){
7953         START_TIMER
7954         set_ue_golomb(&pb, i);
7955         STOP_TIMER("set_ue_golomb");
7956     }
7957     flush_put_bits(&pb);
7958
7959     init_get_bits(&gb, temp, 8*SIZE);
7960     for(i=0; i<COUNT; i++){
7961         int j, s;
7962
7963         s= show_bits(&gb, 24);
7964
7965         START_TIMER
7966         j= get_ue_golomb(&gb);
7967         if(j != i){
7968             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7969 //            return -1;
7970         }
7971         STOP_TIMER("get_ue_golomb");
7972     }
7973
7974
7975     init_put_bits(&pb, temp, SIZE);
7976     printf("testing signed exp golomb\n");
7977     for(i=0; i<COUNT; i++){
7978         START_TIMER
7979         set_se_golomb(&pb, i - COUNT/2);
7980         STOP_TIMER("set_se_golomb");
7981     }
7982     flush_put_bits(&pb);
7983
7984     init_get_bits(&gb, temp, 8*SIZE);
7985     for(i=0; i<COUNT; i++){
7986         int j, s;
7987
7988         s= show_bits(&gb, 24);
7989
7990         START_TIMER
7991         j= get_se_golomb(&gb);
7992         if(j != i - COUNT/2){
7993             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7994 //            return -1;
7995         }
7996         STOP_TIMER("get_se_golomb");
7997     }
7998
7999 #if 0
8000     printf("testing 4x4 (I)DCT\n");
8001
8002     DCTELEM block[16];
8003     uint8_t src[16], ref[16];
8004     uint64_t error= 0, max_error=0;
8005
8006     for(i=0; i<COUNT; i++){
8007         int j;
8008 //        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
8009         for(j=0; j<16; j++){
8010             ref[j]= random()%255;
8011             src[j]= random()%255;
8012         }
8013
8014         h264_diff_dct_c(block, src, ref, 4);
8015
8016         //normalize
8017         for(j=0; j<16; j++){
8018 //            printf("%d ", block[j]);
8019             block[j]= block[j]*4;
8020             if(j&1) block[j]= (block[j]*4 + 2)/5;
8021             if(j&4) block[j]= (block[j]*4 + 2)/5;
8022         }
8023 //        printf("\n");
8024
8025         s->dsp.h264_idct_add(ref, block, 4);
8026 /*        for(j=0; j<16; j++){
8027             printf("%d ", ref[j]);
8028         }
8029         printf("\n");*/
8030
8031         for(j=0; j<16; j++){
8032             int diff= FFABS(src[j] - ref[j]);
8033
8034             error+= diff*diff;
8035             max_error= FFMAX(max_error, diff);
8036         }
8037     }
8038     printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
8039     printf("testing quantizer\n");
8040     for(qp=0; qp<52; qp++){
8041         for(i=0; i<16; i++)
8042             src1_block[i]= src2_block[i]= random()%255;
8043
8044     }
8045     printf("Testing NAL layer\n");
8046
8047     uint8_t bitstream[COUNT];
8048     uint8_t nal[COUNT*2];
8049     H264Context h;
8050     memset(&h, 0, sizeof(H264Context));
8051
8052     for(i=0; i<COUNT; i++){
8053         int zeros= i;
8054         int nal_length;
8055         int consumed;
8056         int out_length;
8057         uint8_t *out;
8058         int j;
8059
8060         for(j=0; j<COUNT; j++){
8061             bitstream[j]= (random() % 255) + 1;
8062         }
8063
8064         for(j=0; j<zeros; j++){
8065             int pos= random() % COUNT;
8066             while(bitstream[pos] == 0){
8067                 pos++;
8068                 pos %= COUNT;
8069             }
8070             bitstream[pos]=0;
8071         }
8072
8073         START_TIMER
8074
8075         nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
8076         if(nal_length<0){
8077             printf("encoding failed\n");
8078             return -1;
8079         }
8080
8081         out= decode_nal(&h, nal, &out_length, &consumed, nal_length);
8082
8083         STOP_TIMER("NAL")
8084
8085         if(out_length != COUNT){
8086             printf("incorrect length %d %d\n", out_length, COUNT);
8087             return -1;
8088         }
8089
8090         if(consumed != nal_length){
8091             printf("incorrect consumed length %d %d\n", nal_length, consumed);
8092             return -1;
8093         }
8094
8095         if(memcmp(bitstream, out, COUNT)){
8096             printf("mismatch\n");
8097             return -1;
8098         }
8099     }
8100 #endif
8101
8102     printf("Testing RBSP\n");
8103
8104
8105     return 0;
8106 }
8107 #endif /* TEST */
8108
8109
8110 static av_cold int decode_end(AVCodecContext *avctx)
8111 {
8112     H264Context *h = avctx->priv_data;
8113     MpegEncContext *s = &h->s;
8114
8115     av_freep(&h->rbsp_buffer[0]);
8116     av_freep(&h->rbsp_buffer[1]);
8117     free_tables(h); //FIXME cleanup init stuff perhaps
8118     MPV_common_end(s);
8119
8120 //    memset(h, 0, sizeof(H264Context));
8121
8122     return 0;
8123 }
8124
8125
8126 AVCodec h264_decoder = {
8127     "h264",
8128     CODEC_TYPE_VIDEO,
8129     CODEC_ID_H264,
8130     sizeof(H264Context),
8131     decode_init,
8132     NULL,
8133     decode_end,
8134     decode_frame,
8135     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,
8136     .flush= flush_dpb,
8137     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
8138 };
8139
8140 #include "svq3.c"