git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35 #include "rectangle.h"
  36
  37 #include "cabac.h"
  38 #ifdef ARCH_X86
  39 #include "i386/h264_i386.h"
  40 #endif
  41
  42 //#undef NDEBUG
  43 #include <assert.h>
  44
  45 /**
  46  * Value of Picture.reference when Picture is not a reference picture, but
  47  * is held for delayed output.
  48  */
  49 #define DELAYED_PIC_REF 4
  50
  /*
   * File-scope VLC tables, private to this translation unit (static).
   * NOTE(review): going by the names these are the coeff_token, total_zeros and
   * run_before code tables (with separate chroma-DC variants); their
   * initialisation and use are outside this excerpt -- confirm there.
   */
  51 static VLC coeff_token_vlc[4];
  52 static VLC chroma_dc_coeff_token_vlc;
  53
  54 static VLC total_zeros_vlc[15];
  55 static VLC chroma_dc_total_zeros_vlc[3];
  56
  57 static VLC run_vlc[6];
  58 static VLC run7_vlc;
  59
  /*
   * Forward declarations of file-local (static) functions.
   * Their definitions are not part of this excerpt.
   */
  60 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  61 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  62 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  63 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  64
  65 static av_always_inline uint32_t pack16to32(int a, int b){
  66 #ifdef WORDS_BIGENDIAN
  67    return (b&0xFFFF) + (a<<16);
  68 #else
  69    return (a&0xFFFF) + (b<<16);
  70 #endif
  71 }
  72
  /**
   * Remainder lookup: ff_rem6[i] == i % 6 for every i in 0..51.
   * One full period (0..5) per row; the last row is a partial period.
   */
  const uint8_t ff_rem6[52]={
      0, 1, 2, 3, 4, 5, /*  0.. 5 */
      0, 1, 2, 3, 4, 5, /*  6..11 */
      0, 1, 2, 3, 4, 5, /* 12..17 */
      0, 1, 2, 3, 4, 5, /* 18..23 */
      0, 1, 2, 3, 4, 5, /* 24..29 */
      0, 1, 2, 3, 4, 5, /* 30..35 */
      0, 1, 2, 3, 4, 5, /* 36..41 */
      0, 1, 2, 3, 4, 5, /* 42..47 */
      0, 1, 2, 3,       /* 48..51 */
  };
  76
  /**
   * Quotient lookup: ff_div6[i] == i / 6 for every i in 0..51.
   * Each row holds the six (for the last row: four) indices sharing a quotient.
   */
  const uint8_t ff_div6[52]={
      0, 0, 0, 0, 0, 0, /*  0.. 5 */
      1, 1, 1, 1, 1, 1, /*  6..11 */
      2, 2, 2, 2, 2, 2, /* 12..17 */
      3, 3, 3, 3, 3, 3, /* 18..23 */
      4, 4, 4, 4, 4, 4, /* 24..29 */
      5, 5, 5, 5, 5, 5, /* 30..35 */
      6, 6, 6, 6, 6, 6, /* 36..41 */
      7, 7, 7, 7, 7, 7, /* 42..47 */
      8, 8, 8, 8,       /* 48..51 */
  };
  80
  81
  /**
   * Fills the neighbour caches inside the H264Context for the macroblock at
   * h->mb_xy.
   *
   * The function first works out the indices of the top-left, top, top-right
   * and the two left neighbour macroblocks (adjusted when FRAME_MBAFF is set),
   * stores the top/left indices in h->top_mb_xy and h->left_mb_xy[], and then
   * loads from those neighbours:
   *  - the intra sample-availability masks and intra4x4_pred_mode_cache
   *    (only when mb_type is intra),
   *  - non_zero_count_cache,
   *  - top_cbp / left_cbp (only when h->pps.cabac),
   *  - mv_cache / ref_cache, and for CABAC also mvd_cache and direct_cache
   *    (only when mb_type is inter or direct),
   *  - neighbor_transform_size.
   *
   * @param h           decoder context; the caches listed above are overwritten
   * @param mb_type     type flags of the current macroblock
   * @param for_deblock nonzero when the caches are wanted for deblocking: a
   *                    neighbour is then used whenever its slice_table entry is
   *                    below 255 instead of only when it equals h->slice_num,
   *                    and the top-left/top-right types are forced to 0
   */
  82 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
  83     MpegEncContext * const s = &h->s;
  84     const int mb_xy= h->mb_xy;
  85     int topleft_xy, top_xy, topright_xy, left_xy[2];
  86     int topleft_type, top_type, topright_type, left_type[2];
  87     int left_block[8];
         /* Used below as a bit mask: -1 keeps the extra row offset when reading the
          * top-left neighbour's mv/ref, 0 drops it. */
  88     int topleft_partition= -1;
  89     int i;
  90
         /* Macroblock above; the row step is mb_stride shifted left by FIELD_PICTURE. */
  91     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
  92
         /* Deblocking shortcut (non-MBAFF only): nothing is loaded when slice_num is 1
          * or the top macroblock has the same slice_table entry as the current one.
          * Note that this returns before h->top_mb_xy and h->left_mb_xy[] are written. */
  93     //FIXME deblocking could skip the intra and nnz parts.
  94     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
  95         return;
  96
  97     /* Wow, what a mess, why didn't they simplify the interlacing & intra
  98      * stuff, I can't imagine that these complex rules are worth it. */
  99
         /* Defaults: the remaining neighbours are the directly adjacent macroblocks,
          * and both halves of the left edge come from the same macroblock. */
 100     topleft_xy = top_xy - 1;
 101     topright_xy= top_xy + 1;
 102     left_xy[1] = left_xy[0] = mb_xy-1;
         /* left_block[0..3] select which entries/rows of the left neighbours are read
          * for the four left-edge cache rows (intra4x4_pred_mode, non_zero_count,
          * motion_val/ref_index); left_block[4..7] are only used as non_zero_count
          * indices for the two small left columns of the nnz cache. */
 103     left_block[0]= 0;
 104     left_block[1]= 1;
 105     left_block[2]= 2;
 106     left_block[3]= 3;
 107     left_block[4]= 7;
 108     left_block[5]= 10;
 109     left_block[6]= 8;
 110     left_block[7]= 11;
 111     if(FRAME_MBAFF){
             /* pair_xy addresses the macroblock in the even row (mb_y & ~1) of the
              * current column, i.e. the upper macroblock of the current vertical pair.
              * The *_mb_frame_flag values are 1 when the corresponding macroblock is
              * not interlaced. */
 112         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 113         const int top_pair_xy      = pair_xy     - s->mb_stride;
 114         const int topleft_pair_xy  = top_pair_xy - 1;
 115         const int topright_pair_xy = top_pair_xy + 1;
 116         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 117         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 118         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 119         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 120         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 121         const int bottom = (s->mb_y & 1);
 122         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
             /* Each of top / top-left / top-right is moved one more row up when the
              * current MB is interlaced and is either the bottom MB of its pair or the
              * respective neighbour pair is interlaced too. */
 123         if (bottom
 124                 ? !curr_mb_frame_flag // bottom macroblock
 125                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 126                 ) {
 127             top_xy -= s->mb_stride;
 128         }
 129         if (bottom
 130                 ? !curr_mb_frame_flag // bottom macroblock
 131                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 132                 ) {
 133             topleft_xy -= s->mb_stride;
 134         } else if(bottom && curr_mb_frame_flag && !left_mb_frame_flag) {
 135             topleft_xy += s->mb_stride;
 136             // take topleft mv from the middle of the mb, as opposed to all other modes which use the bottom-right partition
 137             topleft_partition = 0;
 138         }
 139         if (bottom
 140                 ? !curr_mb_frame_flag // bottom macroblock
 141                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 142                 ) {
 143             topright_xy -= s->mb_stride;
 144         }
             /* Left pair coded in the other frame/field mode than the current MB:
              * restart both left entries at the upper MB of the left pair and remap
              * left_block accordingly. */
 145         if (left_mb_frame_flag != curr_mb_frame_flag) {
 146             left_xy[1] = left_xy[0] = pair_xy - 1;
 147             if (curr_mb_frame_flag) {
 148                 if (bottom) {
 149                     left_block[0]= 2;
 150                     left_block[1]= 2;
 151                     left_block[2]= 3;
 152                     left_block[3]= 3;
 153                     left_block[4]= 8;
 154                     left_block[5]= 11;
 155                     left_block[6]= 8;
 156                     left_block[7]= 11;
 157                 } else {
 158                     left_block[0]= 0;
 159                     left_block[1]= 0;
 160                     left_block[2]= 1;
 161                     left_block[3]= 1;
 162                     left_block[4]= 7;
 163                     left_block[5]= 10;
 164                     left_block[6]= 7;
 165                     left_block[7]= 10;
 166                 }
 167             } else {
                     /* Current MB interlaced, left pair not: the second left entry is the
                      * lower MB of the left pair. Entries 0 and 4 keep their defaults. */
 168                 left_xy[1] += s->mb_stride;
 169                 //left_block[0]= 0;
 170                 left_block[1]= 2;
 171                 left_block[2]= 0;
 172                 left_block[3]= 2;
 173                 //left_block[4]= 7;
 174                 left_block[5]= 10;
 175                 left_block[6]= 7;
 176                 left_block[7]= 10;
 177             }
 178         }
 179     }
 180
         /* Publish the resolved top/left neighbour indices in the context. */
 181     h->top_mb_xy = top_xy;
 182     h->left_mb_xy[0] = left_xy[0];
 183     h->left_mb_xy[1] = left_xy[1];
 184     if(for_deblock){
             /* Deblocking: a neighbour counts whenever its slice_table entry is < 255;
              * the diagonal neighbours are treated as absent (type 0). */
 185         topleft_type = 0;
 186         topright_type = 0;
 187         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
 188         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
 189         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
 190
             /* MBAFF, non-intra MB: rebuild the current MB's own cache contents.
              * The 16 luma nnz flags are unpacked from the bit mask kept at
              * non_zero_count[mb_xy][14] (stored there by write_back_non_zero_count),
              * and mv/ref entries are reloaded from current_picture. */
 191         if(FRAME_MBAFF && !IS_INTRA(mb_type)){
 192             int list;
 193             int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
 194             for(i=0; i<16; i++)
 195                 h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
 196             for(list=0; list<h->list_count; list++){
 197                 if(USES_LIST(mb_type,list)){
 198                     uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
 199                     uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
 200                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
                         /* Copy 4 rows of 4 motion vectors (one uint32_t each); the cache
                          * row pitch is 8 entries. */
 201                     for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
 202                         dst[0] = src[0];
 203                         dst[1] = src[1];
 204                         dst[2] = src[2];
 205                         dst[3] = src[3];
 206                     }
                         /* Each pair of ref_index values is expanded to four bytes
                          * (multiply by 0x0101) and written to two cache rows. */
 207                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 208                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
 209                     ref += h->b8_stride;
 210                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 211                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
 212                 }else{
 213                     fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
 214                     fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
 215                 }
 216             }
 217         }
 218     }else{
             /* Decoding: a neighbour counts only when it belongs to the current slice
              * (slice_table entry equal to h->slice_num); otherwise its type is 0. */
 219         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 220         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 221         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 222         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 223         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 224     }
 225
         /* Intra MB: start from "everything available" masks and clear bits for every
          * neighbour that is absent (type 0), or that is non-intra while
          * constrained_intra_pred is set. */
 226     if(IS_INTRA(mb_type)){
 227         h->topleft_samples_available=
 228         h->top_samples_available=
 229         h->left_samples_available= 0xFFFF;
 230         h->topright_samples_available= 0xEEEA;
 231
 232         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
 233             h->topleft_samples_available= 0xB3FF;
 234             h->top_samples_available= 0x33FF;
 235             h->topright_samples_available= 0x26EA;
 236         }
 237         for(i=0; i<2; i++){
 238             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
 239                 h->topleft_samples_available&= 0xDF5F;
 240                 h->left_samples_available&= 0x5F5F;
 241             }
 242         }
 243
 244         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
 245             h->topleft_samples_available&= 0x7FFF;
 246
 247         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
 248             h->topright_samples_available&= 0xFBFF;
 249
             /* Intra4x4 MB: load the neighbours' prediction modes into the top row and
              * left column of the cache. For a neighbour that is not intra4x4 the
              * entry is -1 when the neighbour is absent or is inter under
              * constrained_intra_pred, and 2 otherwise. */
 250         if(IS_INTRA4x4(mb_type)){
 251             if(IS_INTRA4x4(top_type)){
 252                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 253                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 254                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 255                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 256             }else{
 257                 int pred;
 258                 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
 259                     pred= -1;
 260                 else{
 261                     pred= 2;
 262                 }
 263                 h->intra4x4_pred_mode_cache[4+8*0]=
 264                 h->intra4x4_pred_mode_cache[5+8*0]=
 265                 h->intra4x4_pred_mode_cache[6+8*0]=
 266                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 267             }
 268             for(i=0; i<2; i++){
 269                 if(IS_INTRA4x4(left_type[i])){
 270                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 271                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 272                 }else{
 273                     int pred;
 274                     if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
 275                         pred= -1;
 276                     else{
 277                         pred= 2;
 278                     }
 279                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 280                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 281                 }
 282             }
 283         }
 284     }
 285
 286
 287 /*
 288 0 . T T. T T T T
 289 1 L . .L . . . .
 290 2 L . .L . . . .
 291 3 . T TL . . . .
 292 4 L . .L . . . .
 293 5 L . .. . . . .
 294 */
 295 //FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
         /* Non-zero-count cache, top edge (the T cells in the diagram above): copied
          * from the stored entries of the top macroblock when it is present. When it
          * is absent the cells get 0 for a non-intra MB under CABAC and 64 otherwise. */
 296     if(top_type){
 297         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 298         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 299         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 300         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 301
 302         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 303         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 304
 305         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 306         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 307
 308     }else{
 309         h->non_zero_count_cache[4+8*0]=
 310         h->non_zero_count_cache[5+8*0]=
 311         h->non_zero_count_cache[6+8*0]=
 312         h->non_zero_count_cache[7+8*0]=
 313
 314         h->non_zero_count_cache[1+8*0]=
 315         h->non_zero_count_cache[2+8*0]=
 316
 317         h->non_zero_count_cache[1+8*3]=
 318         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 319
 320     }
 321
         /* Non-zero-count cache, left edge (the L cells): same rule, taken from the
          * two left neighbours through the left_block mapping. */
 322     for (i=0; i<2; i++) {
 323         if(left_type[i]){
 324             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 325             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 326             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 327             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 328         }else{
 329             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 330             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 331             h->non_zero_count_cache[0+8*1 +   8*i]=
 332             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 333         }
 334     }
 335
         /* CABAC only: neighbour coded-block-pattern values. An absent neighbour
          * yields 0x1C0 for an intra MB and 0 otherwise. For left_cbp, bits selected
          * by mask 0x1f0 come from left_xy[0]; bits 1 and 3 are then set from one
          * cbp bit of each left neighbour, chosen through left_block. */
 336     if( h->pps.cabac ) {
 337         // top_cbp
 338         if(top_type) {
 339             h->top_cbp = h->cbp_table[top_xy];
 340         } else if(IS_INTRA(mb_type)) {
 341             h->top_cbp = 0x1C0;
 342         } else {
 343             h->top_cbp = 0;
 344         }
 345         // left_cbp
 346         if (left_type[0]) {
 347             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 348         } else if(IS_INTRA(mb_type)) {
 349             h->left_cbp = 0x1C0;
 350         } else {
 351             h->left_cbp = 0;
 352         }
 353         if (left_type[0]) {
 354             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 355         }
 356         if (left_type[1]) {
 357             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 358         }
 359     }
 360
 361 #if 1
         /* Inter/direct MB: load neighbouring motion vectors and reference indices,
          * per reference list, into the cache border around the current MB. */
 362     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 363         int list;
 364         for(list=0; list<h->list_count; list++){
                 /* A list the MB does not use is skipped, unless the MB is direct or
                  * h->deblocking_filter is set. */
 365             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 366                 /*if(!h->mv_cache_clean[list]){
 367                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 368                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 369                     h->mv_cache_clean[list]= 1;
 370                 }*/
 371                 continue;
 372             }
 373             h->mv_cache_clean[list]= 0;
 374
                 /* Top edge: the last 4x4 row (offset 3*b_stride) and second 8x8 row of
                  * the top MB. If the top MB does not use this list, mvs are zeroed and
                  * refs are LIST_NOT_USED (MB present) or PART_NOT_AVAILABLE (absent). */
 375             if(USES_LIST(top_type, list)){
 376                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 377                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 378                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 379                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 380                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 381                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 382                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 383                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 384                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 385                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 386             }else{
 387                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 388                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 389                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 390                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 391                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 392             }
 393
                 /* Left edge: two cache rows per left neighbour, read from its last
                  * column (offset 3 / 1) at the rows selected by left_block. */
 394             for(i=0; i<2; i++){
 395                 int cache_idx = scan8[0] - 1 + i*2*8;
 396                 if(USES_LIST(left_type[i], list)){
 397                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 398                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 399                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 400                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 401                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 402                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 403                 }else{
 404                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 405                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 406                     h->ref_cache[list][cache_idx  ]=
 407                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 408                 }
 409             }
 410
                 /* Outside MBAFF, the rest of this iteration is skipped for deblocking
                  * and for direct MBs when direct_spatial_mv_pred is not set. */
 411             if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
 412                 continue;
 413
                 /* Top-left corner. topleft_partition is -1 (keep the extra row offset)
                  * or 0 (drop it), see its declaration. */
 414             if(USES_LIST(topleft_type, list)){
 415                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
 416                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
 417                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 418                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 419             }else{
 420                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 421                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 422             }
 423
                 /* Top-right corner: first entry of the top-right MB's last 4x4 row. */
 424             if(USES_LIST(topright_type, list)){
 425                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 426                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 427                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 428                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 429             }else{
 430                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 431                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 432             }
 433
                 /* Outside MBAFF, skip and direct MBs need nothing beyond this point. */
 434             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 435                 continue;
 436
                 /* Mark a fixed set of cache cells as PART_NOT_AVAILABLE with zero mv. */
 437             h->ref_cache[list][scan8[5 ]+1] =
 438             h->ref_cache[list][scan8[7 ]+1] =
 439             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 440             h->ref_cache[list][scan8[4 ]] =
 441             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 442             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 443             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 444             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 445             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 446             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 447
                 /* CABAC: also load the neighbours' motion vector differences from
                  * mvd_table, using the same positions as for the motion vectors;
                  * entries of neighbours that do not use this list are zero. */
 448             if( h->pps.cabac ) {
 449                 /* XXX beurk, Load mvd */
 450                 if(USES_LIST(top_type, list)){
 451                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 452                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 453                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 454                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 455                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 456                 }else{
 457                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 458                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 459                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 460                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 461                 }
 462                 if(USES_LIST(left_type[0], list)){
 463                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 464                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 465                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 466                 }else{
 467                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 468                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 469                 }
 470                 if(USES_LIST(left_type[1], list)){
 471                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 472                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 473                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 474                 }else{
 475                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 476                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 477                 }
 478                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 479                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 480                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 481                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 482                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 483
                     /* B slices: clear the 4x4 interior of direct_cache, then fill the
                      * top and left borders: 1 for a direct neighbour, the neighbour's
                      * direct_table entries when it is 8x8-partitioned, 0 otherwise. */
 484                 if(h->slice_type == FF_B_TYPE){
 485                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 486
 487                     if(IS_DIRECT(top_type)){
 488                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 489                     }else if(IS_8X8(top_type)){
 490                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 491                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 492                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 493                     }else{
 494                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 495                     }
 496
 497                     if(IS_DIRECT(left_type[0]))
 498                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 499                     else if(IS_8X8(left_type[0]))
 500                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 501                     else
 502                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 503
 504                     if(IS_DIRECT(left_type[1]))
 505                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 506                     else if(IS_8X8(left_type[1]))
 507                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 508                     else
 509                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 510                 }
 511             }
 512
                 /* MBAFF: rescale the border entries taken from neighbours whose
                  * frame/field mode differs from the current MB. MAP_MVS applies MAP_F2F
                  * to every border cell together with the type of the MB it came from.
                  * Current MB is field (MB_FIELD): for non-interlaced neighbours with a
                  * valid ref (>= 0) the ref index is doubled and the vertical mv/mvd
                  * component halved. Otherwise: for interlaced neighbours the ref index
                  * is halved and the vertical mv/mvd component doubled. */
 513             if(FRAME_MBAFF){
 514 #define MAP_MVS\
 515                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 516                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 517                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 518                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 519                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 520                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 521                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 522                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 523                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 524                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 525                 if(MB_FIELD){
 526 #define MAP_F2F(idx, mb_type)\
 527                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 528                         h->ref_cache[list][idx] <<= 1;\
 529                         h->mv_cache[list][idx][1] /= 2;\
 530                         h->mvd_cache[list][idx][1] /= 2;\
 531                     }
 532                     MAP_MVS
 533 #undef MAP_F2F
 534                 }else{
 535 #define MAP_F2F(idx, mb_type)\
 536                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 537                         h->ref_cache[list][idx] >>= 1;\
 538                         h->mv_cache[list][idx][1] <<= 1;\
 539                         h->mvd_cache[list][idx][1] <<= 1;\
 540                     }
 541                     MAP_MVS
 542 #undef MAP_F2F
 543                 }
 544             }
 545         }
 546     }
 547 #endif
 548
         /* Number (0..2) of the top and first-left neighbours that use the 8x8 transform. */
 549     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 550 }
 551
 552 static inline void write_back_intra_pred_mode(H264Context *h){
 553     const int mb_xy= h->mb_xy;
 554
 555     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 556     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 557     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 558     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 559     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 560     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 561     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 562 }
 563
 564 /**
 565  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 566  */
 567 static inline int check_intra4x4_pred_mode(H264Context *h){
 568     MpegEncContext * const s = &h->s;
 569     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 570     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 571     int i;
 572
 573     if(!(h->top_samples_available&0x8000)){
 574         for(i=0; i<4; i++){
 575             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 576             if(status<0){
 577                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 578                 return -1;
 579             } else if(status){
 580                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 581             }
 582         }
 583     }
 584
 585     if(!(h->left_samples_available&0x8000)){
 586         for(i=0; i<4; i++){
 587             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 588             if(status<0){
 589                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 590                 return -1;
 591             } else if(status){
 592                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 593             }
 594         }
 595     }
 596
 597     return 0;
 598 } //FIXME cleanup like next
 599
 600 /**
 601  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 602  */
 603 static inline int check_intra_pred_mode(H264Context *h, int mode){
 604     MpegEncContext * const s = &h->s;
 605     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 606     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 607
 608     if(mode > 6U) {
 609         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 610         return -1;
 611     }
 612
 613     if(!(h->top_samples_available&0x8000)){
 614         mode= top[ mode ];
 615         if(mode<0){
 616             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 617             return -1;
 618         }
 619     }
 620
 621     if(!(h->left_samples_available&0x8000)){
 622         mode= left[ mode ];
 623         if(mode<0){
 624             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 625             return -1;
 626         }
 627     }
 628
 629     return mode;
 630 }
 631
 632 /**
 633  * gets the predicted intra4x4 prediction mode.
 634  */
 635 static inline int pred_intra_mode(H264Context *h, int n){
 636     const int index8= scan8[n];
 637     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 638     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 639     const int min= FFMIN(left, top);
 640
 641     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 642
 643     if(min<0) return DC_PRED;
 644     else      return min;
 645 }
 646
 647 static inline void write_back_non_zero_count(H264Context *h){
 648     const int mb_xy= h->mb_xy;
 649
 650     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 651     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 652     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 653     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 654     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 655     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 656     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 657
 658     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 659     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 660     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 661
 662     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 663     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 664     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 665
 666     if(FRAME_MBAFF){
 667         // store all luma nnzs, for deblocking
 668         int v = 0, i;
 669         for(i=0; i<16; i++)
 670             v += (!!h->non_zero_count_cache[scan8[i]]) << i;
 671         *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
 672     }
 673 }
 674
 675 /**
 676  * gets the predicted number of non zero coefficients.
 677  * @param n block index
 678  */
 679 static inline int pred_non_zero_count(H264Context *h, int n){
 680     const int index8= scan8[n];
 681     const int left= h->non_zero_count_cache[index8 - 1];
 682     const int top = h->non_zero_count_cache[index8 - 8];
 683     int i= left + top;
 684
 685     if(i<64) i= (i+1)>>1;
 686
 687     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 688
 689     return i&31;
 690 }
 691
     /**
      * Fetches the top-right ("diagonal") neighbour motion vector of a partition.
      * If the top-right reference is PART_NOT_AVAILABLE, the top-left neighbour
      * (cache position i - 8 - 1) is used instead. For MBAFF frames some
      * neighbours are read directly from the current picture's motion_val /
      * ref_index arrays and copied, with the vertical component and the
      * reference index rescaled, into the cache slot at scan8[0]-2.
      * @param C receives a pointer to the selected motion vector
      * @param i position of the partition inside the scan8-indexed caches
      * @param list reference list the vector is taken from
      * @param part_width partition width, added to i as a number of cache columns
      * @return the reference index that belongs to *C; LIST_NOT_USED if the
      *         MBAFF neighbour macroblock does not use this list
      */
 692 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 693     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 694     MpegEncContext *s = &h->s;
 695
 696     /* there is no consistent mapping of mvs to neighboring locations that will
 697      * make mbaff happy, so we can't move all this logic to fill_caches */
 698     if(FRAME_MBAFF){
 699         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 700         const int16_t *mv;
             /* the cache slot at scan8[0]-2 is used as scratch storage for the result */
 701         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 702         *C = h->mv_cache[list][scan8[0]-2];
 703
 704         if(!MB_FIELD
 705            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 706             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 707             if(IS_INTERLACED(mb_types[topright_xy])){
                     /* SET_DIAG_MV always returns from fetch_diagonal_mv: it loads the mv at
                      * 4x4 block position (X4,Y4) of the current picture, applies MV_OP to
                      * its vertical component and REF_OP to the returned reference index */
 708 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 709                 const int x4 = X4, y4 = Y4;\
 710                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 711                 if(!USES_LIST(mb_type,list))\
 712                     return LIST_NOT_USED;\
 713                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 714                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 715                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 716                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 717
 718                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 719             }
 720         }
 721         if(topright_ref == PART_NOT_AVAILABLE
 722            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 723            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 724             if(!MB_FIELD
 725                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 726                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 727             }
 728             if(MB_FIELD
 729                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 730                && i >= scan8[0]+8){
 731                 // leftshift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's ok.
 732                 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 733             }
 734         }
 735 #undef SET_DIAG_MV
 736     }
 737
         /* regular case: top-right neighbour from the cache, else the top-left one */
 738     if(topright_ref != PART_NOT_AVAILABLE){
 739         *C= h->mv_cache[list][ i - 8 + part_width ];
 740         return topright_ref;
 741     }else{
 742         tprintf(s->avctx, "topright MV not available\n");
 743
 744         *C= h->mv_cache[list][ i - 8 - 1 ];
 745         return h->ref_cache[list][ i - 8 - 1 ];
 746     }
 747 }
 748
 749 /**
 750  * gets the predicted MV.
 751  * @param n the block index
 752  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 753  * @param mx the x component of the predicted motion vector
 754  * @param my the y component of the predicted motion vector
 755  */
 756 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 757     const int index8= scan8[n];
 758     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 759     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 760     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 761     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 762     const int16_t * C;
 763     int diagonal_ref, match_count;
 764
 765     assert(part_width==1 || part_width==2 || part_width==4);
 766
 767 /* mv_cache
 768   B . . A T T T T
 769   U . . L . . , .
 770   U . . L . . . .
 771   U . . L . . , .
 772   . . . L . . . .
 773 */
 774
 775     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 776     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 777     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 778     if(match_count > 1){ //most common
 779         *mx= mid_pred(A[0], B[0], C[0]);
 780         *my= mid_pred(A[1], B[1], C[1]);
 781     }else if(match_count==1){
 782         if(left_ref==ref){
 783             *mx= A[0];
 784             *my= A[1];
 785         }else if(top_ref==ref){
 786             *mx= B[0];
 787             *my= B[1];
 788         }else{
 789             *mx= C[0];
 790             *my= C[1];
 791         }
 792     }else{
 793         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 794             *mx= A[0];
 795             *my= A[1];
 796         }else{
 797             *mx= mid_pred(A[0], B[0], C[0]);
 798             *my= mid_pred(A[1], B[1], C[1]);
 799         }
 800     }
 801
 802     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 803 }
 804
 805 /**
 806  * gets the directionally predicted 16x8 MV.
 807  * @param n the block index
 808  * @param mx the x component of the predicted motion vector
 809  * @param my the y component of the predicted motion vector
 810  */
 811 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 812     if(n==0){
 813         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 814         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 815
 816         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 817
 818         if(top_ref == ref){
 819             *mx= B[0];
 820             *my= B[1];
 821             return;
 822         }
 823     }else{
 824         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 825         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 826
 827         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 828
 829         if(left_ref == ref){
 830             *mx= A[0];
 831             *my= A[1];
 832             return;
 833         }
 834     }
 835
 836     //RARE
 837     pred_motion(h, n, 4, list, ref, mx, my);
 838 }
 839
 840 /**
 841  * gets the directionally predicted 8x16 MV.
 842  * @param n the block index
 843  * @param mx the x component of the predicted motion vector
 844  * @param my the y component of the predicted motion vector
 845  */
 846 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 847     if(n==0){
 848         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 849         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 850
 851         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 852
 853         if(left_ref == ref){
 854             *mx= A[0];
 855             *my= A[1];
 856             return;
 857         }
 858     }else{
 859         const int16_t * C;
 860         int diagonal_ref;
 861
 862         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 863
 864         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 865
 866         if(diagonal_ref == ref){
 867             *mx= C[0];
 868             *my= C[1];
 869             return;
 870         }
 871     }
 872
 873     //RARE
 874     pred_motion(h, n, 2, list, ref, mx, my);
 875 }
 876
 877 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 878     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 879     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 880
 881     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 882
 883     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 884        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 885        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 886
 887         *mx = *my = 0;
 888         return;
 889     }
 890
 891     pred_motion(h, 0, 4, 0, 0, mx, my);
 892
 893     return;
 894 }
 895
 896 static inline void direct_dist_scale_factor(H264Context * const h){
 897     const int poc = h->s.current_picture_ptr->poc;
 898     const int poc1 = h->ref_list[1][0].poc;
 899     int i;
 900     for(i=0; i<h->ref_count[0]; i++){
 901         int poc0 = h->ref_list[0][i].poc;
 902         int td = av_clip(poc1 - poc0, -128, 127);
 903         if(td == 0 /* FIXME || pic0 is a long-term ref */){
 904             h->dist_scale_factor[i] = 256;
 905         }else{
 906             int tb = av_clip(poc - poc0, -128, 127);
 907             int tx = (16384 + (FFABS(td) >> 1)) / td;
 908             h->dist_scale_factor[i] = av_clip((tb*tx + 32) >> 6, -1024, 1023);
 909         }
 910     }
 911     if(FRAME_MBAFF){
 912         for(i=0; i<h->ref_count[0]; i++){
 913             h->dist_scale_factor_field[2*i] =
 914             h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
 915         }
 916     }
 917 }
 918 static inline void direct_ref_list_init(H264Context * const h){
 919     MpegEncContext * const s = &h->s;
 920     Picture * const ref1 = &h->ref_list[1][0];
 921     Picture * const cur = s->current_picture_ptr;
 922     int list, i, j;
 923     if(cur->pict_type == FF_I_TYPE)
 924         cur->ref_count[0] = 0;
 925     if(cur->pict_type != FF_B_TYPE)
 926         cur->ref_count[1] = 0;
 927     for(list=0; list<2; list++){
 928         cur->ref_count[list] = h->ref_count[list];
 929         for(j=0; j<h->ref_count[list]; j++)
 930             cur->ref_poc[list][j] = h->ref_list[list][j].poc;
 931     }
 932     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 933         return;
 934     for(list=0; list<2; list++){
 935         for(i=0; i<ref1->ref_count[list]; i++){
 936             const int poc = ref1->ref_poc[list][i];
 937             h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
 938             for(j=0; j<h->ref_count[list]; j++)
 939                 if(h->ref_list[list][j].poc == poc){
 940                     h->map_col_to_list0[list][i] = j;
 941                     break;
 942                 }
 943         }
 944     }
 945     if(FRAME_MBAFF){
 946         for(list=0; list<2; list++){
 947             for(i=0; i<ref1->ref_count[list]; i++){
 948                 j = h->map_col_to_list0[list][i];
 949                 h->map_col_to_list0_field[list][2*i] = 2*j;
 950                 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
 951             }
 952         }
 953     }
 954 }
 955
     /**
      * Derives motion vectors and reference indices for a direct predicted
      * macroblock and stores them in h->mv_cache and h->ref_cache.
      * Spatial prediction is used when h->direct_spatial_mv_pred is set, otherwise
      * temporal prediction that scales the vectors of the co-located block in
      * h->ref_list[1][0]. If the macroblock is 8x8 partitioned on entry, only the
      * sub-blocks whose h->sub_mb_type is direct are processed.
      * @param mb_type in: type of the current macroblock; out: rewritten with the
      *                partitioning and list usage chosen for the direct prediction
      */
 956 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 957     MpegEncContext * const s = &h->s;
 958     const int mb_xy =   h->mb_xy;
 959     const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
 960     const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
         /* type, motion vectors and reference indices of the co-located macroblock */
 961     const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
 962     const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
 963     const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
 964     const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
 965     const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
 966     const int is_b8x8 = IS_8X8(*mb_type);
 967     unsigned int sub_mb_type;
 968     int i8, i4;
 969
         /* choose the partitioning from the type of the co-located macroblock */
 970 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
 971     if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
 972         /* FIXME save sub mb types from previous frames (or derive from MVs)
 973          * so we know exactly what block size to use */
 974         sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
 975         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
 976     }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
 977         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
 978         *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
 979     }else{
 980         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
 981         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
 982     }
 983     if(!is_b8x8)
 984         *mb_type |= MB_TYPE_DIRECT2;
 985     if(MB_FIELD)
 986         *mb_type |= MB_TYPE_INTERLACED;
 987
 988     tprintf(s->avctx, "mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);
 989
 990     if(h->direct_spatial_mv_pred){
 991         int ref[2];
 992         int mv[2][2];
 993         int list;
 994
 995         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
 996
 997         /* ref = min(neighbors) */
 998         for(list=0; list<2; list++){
 999             int refa = h->ref_cache[list][scan8[0] - 1];
1000             int refb = h->ref_cache[list][scan8[0] - 8];
1001             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
                 /* NOTE(review): -2 is presumably PART_NOT_AVAILABLE; the top-left
                  * neighbour then replaces the top-right one - confirm */
1002             if(refc == -2)
1003                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1004             ref[list] = refa;
1005             if(ref[list] < 0 || (refb < ref[list] && refb >= 0))
1006                 ref[list] = refb;
1007             if(ref[list] < 0 || (refc < ref[list] && refc >= 0))
1008                 ref[list] = refc;
1009             if(ref[list] < 0)
1010                 ref[list] = -1;
1011         }
1012
             /* no neighbour supplies a reference in either list: use reference 0
              * with zero vectors for both lists */
1013         if(ref[0] < 0 && ref[1] < 0){
1014             ref[0] = ref[1] = 0;
1015             mv[0][0] = mv[0][1] =
1016             mv[1][0] = mv[1][1] = 0;
1017         }else{
1018             for(list=0; list<2; list++){
1019                 if(ref[list] >= 0)
1020                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1021                 else
1022                     mv[list][0] = mv[list][1] = 0;
1023             }
1024         }
1025
             /* remove the list that has no reference from the type flags */
1026         if(ref[1] < 0){
1027             if(!is_b8x8)
1028                 *mb_type &= ~MB_TYPE_L1;
1029             sub_mb_type &= ~MB_TYPE_L1;
1030         }else if(ref[0] < 0){
1031             if(!is_b8x8)
1032                 *mb_type &= ~MB_TYPE_L0;
1033             sub_mb_type &= ~MB_TYPE_L0;
1034         }
1035
             /* current and co-located macroblock differ in their interlaced flag */
1036         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1037             int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1038             int mb_types_col[2];
1039             int b8_stride = h->b8_stride;
1040             int b4_stride = h->b_stride;
1041
1042             *mb_type = (*mb_type & ~MB_TYPE_16x16) | MB_TYPE_8x8;
1043
1044             if(IS_INTERLACED(*mb_type)){
1045                 mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1046                 mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1047                 if(s->mb_y&1){
1048                     l1ref0 -= 2*b8_stride;
1049                     l1ref1 -= 2*b8_stride;
1050                     l1mv0 -= 4*b4_stride;
1051                     l1mv1 -= 4*b4_stride;
1052                 }
1053                 b8_stride *= 3;
1054                 b4_stride *= 6;
1055             }else{
1056                 int cur_poc = s->current_picture_ptr->poc;
1057                 int *col_poc = h->ref_list[1]->field_poc;
1058                 int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1059                 int dy = 2*col_parity - (s->mb_y&1);
1060                 mb_types_col[0] =
1061                 mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy + col_parity*s->mb_stride];
1062                 l1ref0 += dy*b8_stride;
1063                 l1ref1 += dy*b8_stride;
1064                 l1mv0 += 2*dy*b4_stride;
1065                 l1mv1 += 2*dy*b4_stride;
1066                 b8_stride = 0;
1067             }
1068
1069             for(i8=0; i8<4; i8++){
1070                 int x8 = i8&1;
1071                 int y8 = i8>>1;
1072                 int xy8 = x8+y8*b8_stride;
1073                 int xy4 = 3*x8+y8*b4_stride;
1074                 int a=0, b=0;
1075
1076                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1077                     continue;
1078                 h->sub_mb_type[i8] = sub_mb_type;
1079
1080                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1081                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
                     /* co-located block has reference 0 and a vector within +-1: the
                      * predicted vector is only kept for lists whose reference is > 0 */
1082                 if(!IS_INTRA(mb_types_col[y8])
1083                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1084                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1085                     if(ref[0] > 0)
1086                         a= pack16to32(mv[0][0],mv[0][1]);
1087                     if(ref[1] > 0)
1088                         b= pack16to32(mv[1][0],mv[1][1]);
1089                 }else{
1090                     a= pack16to32(mv[0][0],mv[0][1]);
1091                     b= pack16to32(mv[1][0],mv[1][1]);
1092                 }
1093                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1094                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1095             }
1096         }else if(IS_16X16(*mb_type)){
                 /* one vector per list for the whole macroblock */
1097             int a=0, b=0;
1098
1099             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1100             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1101             if(!IS_INTRA(mb_type_col)
1102                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1103                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1104                        && (h->x264_build>33 || !h->x264_build)))){
1105                 if(ref[0] > 0)
1106                     a= pack16to32(mv[0][0],mv[0][1]);
1107                 if(ref[1] > 0)
1108                     b= pack16to32(mv[1][0],mv[1][1]);
1109             }else{
1110                 a= pack16to32(mv[0][0],mv[0][1]);
1111                 b= pack16to32(mv[1][0],mv[1][1]);
1112             }
1113             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1114             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1115         }else{
                 /* per 8x8 block: store the predicted vectors first, then zero them
                  * where the co-located test below succeeds and the reference is 0 */
1116             for(i8=0; i8<4; i8++){
1117                 const int x8 = i8&1;
1118                 const int y8 = i8>>1;
1119
1120                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1121                     continue;
1122                 h->sub_mb_type[i8] = sub_mb_type;
1123
1124                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1125                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1126                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1127                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1128
1129                 /* col_zero_flag */
1130                 if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
1131                                               || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
1132                                                   && (h->x264_build>33 || !h->x264_build)))){
1133                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
1134                     if(IS_SUB_8X8(sub_mb_type)){
1135                         const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1136                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1137                             if(ref[0] == 0)
1138                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1139                             if(ref[1] == 0)
1140                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1141                         }
1142                     }else
1143                     for(i4=0; i4<4; i4++){
1144                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1145                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1146                             if(ref[0] == 0)
1147                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1148                             if(ref[1] == 0)
1149                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1150                         }
1151                     }
1152                 }
1153             }
1154         }
1155     }else{ /* direct temporal mv pred */
1156         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1157         const int *dist_scale_factor = h->dist_scale_factor;
1158
1159         if(FRAME_MBAFF){
                 /* interlaced macroblocks use the field variants of the lookup tables */
1160             if(IS_INTERLACED(*mb_type)){
1161                 map_col_to_list0[0] = h->map_col_to_list0_field[0];
1162                 map_col_to_list0[1] = h->map_col_to_list0_field[1];
1163                 dist_scale_factor = h->dist_scale_factor_field;
1164             }
1165             if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1166                 /* FIXME assumes direct_8x8_inference == 1 */
1167                 const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1168                 int mb_types_col[2];
1169                 int y_shift;
1170
                     /* NOTE(review): MB_TYPE_8x8 is set here and a second size flag
                      * (MB_TYPE_16x8 or MB_TYPE_16x16) may be OR-ed in below, leaving
                      * two partition size bits set - confirm this is intended */
1171                 *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
1172                          | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
1173                          | (*mb_type & MB_TYPE_INTERLACED);
1174                 sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;
1175
1176                 if(IS_INTERLACED(*mb_type)){
1177                     /* frame to field scaling */
1178                     mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1179                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1180                     if(s->mb_y&1){
1181                         l1ref0 -= 2*h->b8_stride;
1182                         l1ref1 -= 2*h->b8_stride;
1183                         l1mv0 -= 4*h->b_stride;
1184                         l1mv1 -= 4*h->b_stride;
1185                     }
1186                     y_shift = 0;
1187
1188                     if(   (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
1189                        && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
1190                        && !is_b8x8)
1191                         *mb_type |= MB_TYPE_16x8;
1192                     else
1193                         *mb_type |= MB_TYPE_8x8;
1194                 }else{
1195                     /* field to frame scaling */
1196                     /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
1197                      * but in MBAFF, top and bottom POC are equal */
1198                     int dy = (s->mb_y&1) ? 1 : 2;
1199                     mb_types_col[0] =
1200                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1201                     l1ref0 += dy*h->b8_stride;
1202                     l1ref1 += dy*h->b8_stride;
1203                     l1mv0 += 2*dy*h->b_stride;
1204                     l1mv1 += 2*dy*h->b_stride;
1205                     y_shift = 2;
1206
1207                     if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
1208                        && !is_b8x8)
1209                         *mb_type |= MB_TYPE_16x16;
1210                     else
1211                         *mb_type |= MB_TYPE_8x8;
1212                 }
1213
1214                 for(i8=0; i8<4; i8++){
1215                     const int x8 = i8&1;
1216                     const int y8 = i8>>1;
1217                     int ref0, scale;
1218                     const int16_t (*l1mv)[2]= l1mv0;
1219
1220                     if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1221                         continue;
1222                     h->sub_mb_type[i8] = sub_mb_type;
1223
1224                     fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                         /* intra co-located block: reference 0 and zero vectors in both lists */
1225                     if(IS_INTRA(mb_types_col[y8])){
1226                         fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1227                         fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1228                         fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1229                         continue;
1230                     }
1231
                         /* take the co-located list 0 reference, or the list 1 one (with
                          * its vectors) if list 0 is unused, and map it to a list 0 index */
1232                     ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
1233                     if(ref0 >= 0)
1234                         ref0 = map_col_to_list0[0][ref0*2>>y_shift];
1235                     else{
1236                         ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
1237                         l1mv= l1mv1;
1238                     }
1239                     scale = dist_scale_factor[ref0];
1240                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1241
1242                     {
1243                         const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
1244                         int my_col = (mv_col[1]<<y_shift)/2;
1245                         int mx = (scale * mv_col[0] + 128) >> 8;
1246                         int my = (scale * my_col + 128) >> 8;
1247                         fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1248                         fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1249                     }
1250                 }
1251                 return;
1252             }
1253         }
1254
1255         /* one-to-one mv scaling */
1256
1257         if(IS_16X16(*mb_type)){
1258             int ref, mv0, mv1;
1259
1260             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1261             if(IS_INTRA(mb_type_col)){
1262                 ref=mv0=mv1=0;
1263             }else{
1264                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
1265                                                 : map_col_to_list0[1][l1ref1[0]];
1266                 const int scale = dist_scale_factor[ref0];
1267                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1268                 int mv_l0[2];
                     /* list 0 vector = scaled co-located vector, list 1 vector = list 0
                      * vector minus the co-located vector */
1269                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1270                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1271                 ref= ref0;
1272                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1273                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1274             }
1275             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1276             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1277             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1278         }else{
1279             for(i8=0; i8<4; i8++){
1280                 const int x8 = i8&1;
1281                 const int y8 = i8>>1;
1282                 int ref0, scale;
1283                 const int16_t (*l1mv)[2]= l1mv0;
1284
1285                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1286                     continue;
1287                 h->sub_mb_type[i8] = sub_mb_type;
1288                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1289                 if(IS_INTRA(mb_type_col)){
1290                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1291                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1292                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1293                     continue;
1294                 }
1295
1296                 ref0 = l1ref0[x8 + y8*h->b8_stride];
1297                 if(ref0 >= 0)
1298                     ref0 = map_col_to_list0[0][ref0];
1299                 else{
1300                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
1301                     l1mv= l1mv1;
1302                 }
1303                 scale = dist_scale_factor[ref0];
1304
1305                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
                     /* one vector per 8x8 block, or one per 4x4 block otherwise */
1306                 if(IS_SUB_8X8(sub_mb_type)){
1307                     const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1308                     int mx = (scale * mv_col[0] + 128) >> 8;
1309                     int my = (scale * mv_col[1] + 128) >> 8;
1310                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1311                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1312                 }else
1313                 for(i4=0; i4<4; i4++){
1314                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1315                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1316                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1317                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1318                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1319                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1320                 }
1321             }
1322         }
1323     }
1324 }
1325
1326 static inline void write_back_motion(H264Context *h, int mb_type){
1327     MpegEncContext * const s = &h->s;
1328     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1329     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1330     int list;
1331
1332     if(!USES_LIST(mb_type, 0))
1333         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1334
1335     for(list=0; list<h->list_count; list++){
1336         int y;
1337         if(!USES_LIST(mb_type, list))
1338             continue;
1339
1340         for(y=0; y<4; y++){
1341             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1342             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1343         }
1344         if( h->pps.cabac ) {
1345             if(IS_SKIP(mb_type))
1346                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1347             else
1348             for(y=0; y<4; y++){
1349                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1350                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1351             }
1352         }
1353
1354         {
1355             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1356             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1357             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1358             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1359             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1360         }
1361     }
1362
1363     if(h->slice_type == FF_B_TYPE && h->pps.cabac){
1364         if(IS_8X8(mb_type)){
1365             uint8_t *direct_table = &h->direct_table[b8_xy];
1366             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1367             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1368             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1369         }
1370     }
1371 }
1372
1373 /**
1374  * Decodes a network abstraction layer unit.
1375  * @param consumed is the number of bytes used as input
1376  * @param length is the length of the array
1377  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1378  * @returns decoded bytes, might be src+1 if no escapes
1379  */
1380 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1381     int i, si, di;
1382     uint8_t *dst;
1383     int bufidx;
1384
1385 //    src[0]&0x80;                //forbidden bit
1386     h->nal_ref_idc= src[0]>>5;
1387     h->nal_unit_type= src[0]&0x1F;
1388
1389     src++; length--;
1390 #if 0
1391     for(i=0; i<length; i++)
1392         printf("%2X ", src[i]);
1393 #endif
1394     for(i=0; i+1<length; i+=2){
1395         if(src[i]) continue;
1396         if(i>0 && src[i-1]==0) i--;
1397         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1398             if(src[i+2]!=3){
1399                 /* startcode, so we must be past the end */
1400                 length=i;
1401             }
1402             break;
1403         }
1404     }
1405
1406     if(i>=length-1){ //no escaped 0
1407         *dst_length= length;
1408         *consumed= length+1; //+1 for the header
1409         return src;
1410     }
1411
1412     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1413     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
1414     dst= h->rbsp_buffer[bufidx];
1415
1416     if (dst == NULL){
1417         return NULL;
1418     }
1419
1420 //printf("decoding esc\n");
1421     si=di=0;
1422     while(si<length){
1423         //remove escapes (very rare 1:2^22)
1424         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1425             if(src[si+2]==3){ //escape
1426                 dst[di++]= 0;
1427                 dst[di++]= 0;
1428                 si+=3;
1429                 continue;
1430             }else //next start code
1431                 break;
1432         }
1433
1434         dst[di++]= src[si++];
1435     }
1436
1437     *dst_length= di;
1438     *consumed= si + 1;//+1 for the header
1439 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1440     return dst;
1441 }
1442
1443 /**
1444  * identifies the exact end of the bitstream
1445  * @return the length of the trailing, or 0 if damaged
1446  */
1447 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1448     int v= *src;
1449     int r;
1450
1451     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1452
1453     for(r=1; r<9; r++){
1454         if(v&1) return r;
1455         v>>=1;
1456     }
1457     return 0;
1458 }
1459
1460 /**
1461  * idct tranforms the 16 dc values and dequantize them.
1462  * @param qp quantization parameter
1463  */
1464 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1465 #define stride 16
1466     int i;
1467     int temp[16]; //FIXME check if this is a good idea
1468     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1469     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1470
1471 //memset(block, 64, 2*256);
1472 //return;
1473     for(i=0; i<4; i++){
1474         const int offset= y_offset[i];
1475         const int z0= block[offset+stride*0] + block[offset+stride*4];
1476         const int z1= block[offset+stride*0] - block[offset+stride*4];
1477         const int z2= block[offset+stride*1] - block[offset+stride*5];
1478         const int z3= block[offset+stride*1] + block[offset+stride*5];
1479
1480         temp[4*i+0]= z0+z3;
1481         temp[4*i+1]= z1+z2;
1482         temp[4*i+2]= z1-z2;
1483         temp[4*i+3]= z0-z3;
1484     }
1485
1486     for(i=0; i<4; i++){
1487         const int offset= x_offset[i];
1488         const int z0= temp[4*0+i] + temp[4*2+i];
1489         const int z1= temp[4*0+i] - temp[4*2+i];
1490         const int z2= temp[4*1+i] - temp[4*3+i];
1491         const int z3= temp[4*1+i] + temp[4*3+i];
1492
1493         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_resdual
1494         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1495         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1496         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1497     }
1498 }
1499
#if 0
/**
 * DCT transforms the 16 dc values in place (disabled code).
 * Each result is halved (>>1) before it is stored back into block.
 * Uses the stride macro, which is not defined inside this function.
 * @param qp quantization parameter ??? FIXME (parameter is commented out)
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    int i;
    int temp[16]; //FIXME check if this is a good idea
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};

    // first pass: butterflies over the four values selected by y_offset[i]
    for(i=0; i<4; i++){
        const int offset= y_offset[i];
        const int z0= block[offset+stride*0] + block[offset+stride*4];
        const int z1= block[offset+stride*0] - block[offset+stride*4];
        const int z2= block[offset+stride*1] - block[offset+stride*5];
        const int z3= block[offset+stride*1] + block[offset+stride*5];

        temp[4*i+0]= z0+z3;
        temp[4*i+1]= z1+z2;
        temp[4*i+2]= z1-z2;
        temp[4*i+3]= z0-z3;
    }

    // second pass over the other direction, results are halved
    for(i=0; i<4; i++){
        const int offset= x_offset[i];
        const int z0= temp[4*0+i] + temp[4*2+i];
        const int z1= temp[4*0+i] - temp[4*2+i];
        const int z2= temp[4*1+i] - temp[4*3+i];
        const int z3= temp[4*1+i] + temp[4*3+i];

        block[stride*0 +offset]= (z0 + z3)>>1;
        block[stride*2 +offset]= (z1 + z2)>>1;
        block[stride*8 +offset]= (z1 - z2)>>1;
        block[stride*10+offset]= (z0 - z3)>>1;
    }
}
#endif
1539
1540 #undef xStride
1541 #undef stride
1542
1543 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1544     const int stride= 16*2;
1545     const int xStride= 16;
1546     int a,b,c,d,e;
1547
1548     a= block[stride*0 + xStride*0];
1549     b= block[stride*0 + xStride*1];
1550     c= block[stride*1 + xStride*0];
1551     d= block[stride*1 + xStride*1];
1552
1553     e= a-b;
1554     a= a+b;
1555     b= c-d;
1556     c= c+d;
1557
1558     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1559     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1560     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1561     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1562 }
1563
#if 0
/**
 * Applies a 2x2 butterfly transform to the four chroma dc values in place
 * (disabled code). No scaling is applied to the results.
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int stride= 16*2;
    const int xStride= 16;
    int a,b,c,d,e;

    a= block[stride*0 + xStride*0];
    b= block[stride*0 + xStride*1];
    c= block[stride*1 + xStride*0];
    d= block[stride*1 + xStride*1];

    // horizontal sums and differences of both rows
    e= a-b;
    a= a+b;
    b= c-d;
    c= c+d;

    // vertical sums and differences
    block[stride*0 + xStride*0]= (a+c);
    block[stride*0 + xStride*1]= (e+b);
    block[stride*1 + xStride*0]= (a-c);
    block[stride*1 + xStride*1]= (e-b);
}
#endif
1586
1587 /**
1588  * gets the chroma qp.
1589  */
1590 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1591     return h->pps.chroma_qp_table[t][qscale & 0xff];
1592 }
1593
//FIXME need to check that this does not overflow signed 32 bit for low qp, I am not sure, it's very close
//FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
/**
 * Quantizes the 16 coefficients of block in place.
 * Each coefficient is multiplied by its quant_coeff[qscale] entry; products
 * whose magnitude does not exceed the dead zone threshold become 0, all
 * others are rounded with a bias of 1/3 (intra) or 1/6 (otherwise) of the
 * quantization step.
 * @param scantable order in which the coefficients are visited
 * @param intra selects the rounding bias
 * @param separate_dc if non zero, block[0] is quantized separately with a
 *                    different shift and the scan starts at position 1
 * @return the scan position of the last non zero coefficient of the main
 *         loop; if none is found this is -1, or 0 when separate_dc is set
 *         (block[0] itself is not tested for this)
 */
static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
    int i;
    const int * const quant_table= quant_coeff[qscale];
    const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
    // (unsigned)(level+threshold1) > threshold2 is true exactly when level
    // lies outside [-threshold1, threshold1]
    const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
    const unsigned int threshold2= (threshold1<<1);
    int last_non_zero;

    if(separate_dc){
        if(qscale<=18){
            //avoid overflows
            // uses the table of qscale+18 together with a shift reduced by 2
            const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
            const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
            const unsigned int dc_threshold2= (dc_threshold1<<1);

            int level= block[0]*quant_coeff[qscale+18][0];
            if(((unsigned)(level+dc_threshold1))>dc_threshold2){
                if(level>0){
                    level= (dc_bias + level)>>(QUANT_SHIFT-2);
                    block[0]= level;
                }else{
                    level= (dc_bias - level)>>(QUANT_SHIFT-2);
                    block[0]= -level;
                }
//                last_non_zero = i;
            }else{
                block[0]=0;
            }
        }else{
            // regular table of qscale with a shift increased by 1
            const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
            const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
            const unsigned int dc_threshold2= (dc_threshold1<<1);

            int level= block[0]*quant_table[0];
            if(((unsigned)(level+dc_threshold1))>dc_threshold2){
                if(level>0){
                    level= (dc_bias + level)>>(QUANT_SHIFT+1);
                    block[0]= level;
                }else{
                    level= (dc_bias - level)>>(QUANT_SHIFT+1);
                    block[0]= -level;
                }
//                last_non_zero = i;
            }else{
                block[0]=0;
            }
        }
        // the dc coefficient has been handled, continue with scan position 1
        last_non_zero= 0;
        i=1;
    }else{
        last_non_zero= -1;
        i=0;
    }

    for(; i<16; i++){
        const int j= scantable[i];
        int level= block[j]*quant_table[j];

//        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
//           || bias-level >= (1<<(QMAT_SHIFT - 3))){
        if(((unsigned)(level+threshold1))>threshold2){
            // round the magnitude, then restore the sign
            if(level>0){
                level= (bias + level)>>QUANT_SHIFT;
                block[j]= level;
            }else{
                level= (bias - level)>>QUANT_SHIFT;
                block[j]= -level;
            }
            last_non_zero = i;
        }else{
            block[j]=0;
        }
    }

    return last_non_zero;
}
1672
/**
 * Motion compensates one partition from a single reference picture.
 * The motion vector is taken from h->mv_cache[list][scan8[n]]; luma is
 * predicted with qpix_op, both chroma planes with chroma_op.
 * Nothing is done if the reference picture has no data.
 * @param pic reference picture to predict from
 * @param n block index whose cached motion vector is used
 * @param square if zero, a second luma call is made at an offset of delta
 * @param chroma_height height passed on to chroma_op
 * @param delta offset added to both dest_y and the luma source for the second luma call
 * @param list reference list the motion vector is read from
 * @param src_x_offset horizontal position of the partition, added to the motion vector as src_x_offset*8
 * @param src_y_offset vertical position of the partition, added to the motion vector as src_y_offset*8
 */
static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int src_x_offset, int src_y_offset,
                           qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
    MpegEncContext * const s = &h->s;
    const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
    int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
    // the low 2 bits of mx and my select the luma interpolation function
    const int luma_xy= (mx&3) + ((my&3)<<2);
    uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
    uint8_t * src_cb, * src_cr;
    int extra_width= h->emu_edge_width;
    int extra_height= h->emu_edge_height;
    int emu=0;
    const int full_mx= mx>>2;
    const int full_my= my>>2;
    const int pic_width  = 16*s->mb_width;
    const int pic_height = 16*s->mb_height >> MB_FIELD;

    if(!pic->data[0]) //FIXME this is unacceptable, some senseable error concealment must be done for missing reference frames
        return;

    // a smaller margin is allowed around the picture when mx / my have any
    // of their low 3 bits set
    if(mx&7) extra_width -= 3;
    if(my&7) extra_height -= 3;

    // source area reaches beyond the allowed margin: build it in edge_emu_buffer
    if(   full_mx < 0-extra_width
       || full_my < 0-extra_height
       || full_mx + 16/*FIXME*/ > pic_width + extra_width
       || full_my + 16/*FIXME*/ > pic_height + extra_height){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
            src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
        emu=1;
    }

    qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
    if(!square){
        qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
    }

    // luma only decoding: skip both chroma planes
    if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;

    if(MB_FIELD){
        // chroma offset when predicting from a field of opposite parity
        my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
        emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
    }
    src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
    src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;

    // edge_emu_buffer is shared: cb is predicted before the buffer is reused for cr
    if(emu){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
            src_cb= s->edge_emu_buffer;
    }
    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);

    if(emu){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
            src_cr= s->edge_emu_buffer;
    }
    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
}
1733
1734 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1735                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1736                            int x_offset, int y_offset,
1737                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1738                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1739                            int list0, int list1){
1740     MpegEncContext * const s = &h->s;
1741     qpel_mc_func *qpix_op=  qpix_put;
1742     h264_chroma_mc_func chroma_op= chroma_put;
1743
1744     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1745     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1746     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1747     x_offset += 8*s->mb_x;
1748     y_offset += 8*(s->mb_y >> MB_FIELD);
1749
1750     if(list0){
1751         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1752         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1753                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1754                            qpix_op, chroma_op);
1755
1756         qpix_op=  qpix_avg;
1757         chroma_op= chroma_avg;
1758     }
1759
1760     if(list1){
1761         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1762         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1763                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1764                            qpix_op, chroma_op);
1765     }
1766 }
1767
1768 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1769                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1770                            int x_offset, int y_offset,
1771                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1772                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1773                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1774                            int list0, int list1){
1775     MpegEncContext * const s = &h->s;
1776
1777     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1778     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1779     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1780     x_offset += 8*s->mb_x;
1781     y_offset += 8*(s->mb_y >> MB_FIELD);
1782
1783     if(list0 && list1){
1784         /* don't optimize for luma-only case, since B-frames usually
1785          * use implicit weights => chroma too. */
1786         uint8_t *tmp_cb = s->obmc_scratchpad;
1787         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1788         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1789         int refn0 = h->ref_cache[0][ scan8[n] ];
1790         int refn1 = h->ref_cache[1][ scan8[n] ];
1791
1792         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1793                     dest_y, dest_cb, dest_cr,
1794                     x_offset, y_offset, qpix_put, chroma_put);
1795         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1796                     tmp_y, tmp_cb, tmp_cr,
1797                     x_offset, y_offset, qpix_put, chroma_put);
1798
1799         if(h->use_weight == 2){
1800             int weight0 = h->implicit_weight[refn0][refn1];
1801             int weight1 = 64 - weight0;
1802             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1803             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1804             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1805         }else{
1806             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1807                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1808                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1809             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1810                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1811                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1812             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1813                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1814                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1815         }
1816     }else{
1817         int list = list1 ? 1 : 0;
1818         int refn = h->ref_cache[list][ scan8[n] ];
1819         Picture *ref= &h->ref_list[list][refn];
1820         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1821                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1822                     qpix_put, chroma_put);
1823
1824         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1825                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1826         if(h->use_weight_chroma){
1827             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1828                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1829             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1830                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1831         }
1832     }
1833 }
1834
1835 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1836                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1837                            int x_offset, int y_offset,
1838                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1839                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1840                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1841                            int list0, int list1){
1842     if((h->use_weight==2 && list0 && list1
1843         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1844        || h->use_weight==1)
1845         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1846                          x_offset, y_offset, qpix_put, chroma_put,
1847                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1848     else
1849         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1850                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1851 }
1852
1853 static inline void prefetch_motion(H264Context *h, int list){
1854     /* fetch pixels for estimated mv 4 macroblocks ahead
1855      * optimized for 64byte cache lines */
1856     MpegEncContext * const s = &h->s;
1857     const int refn = h->ref_cache[list][scan8[0]];
1858     if(refn >= 0){
1859         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1860         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1861         uint8_t **src= h->ref_list[list][refn].data;
1862         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1863         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1864         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1865         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1866     }
1867 }
1868
/**
 * Performs the motion compensation of the current inter macroblock.
 * The macroblock is split according to its type (16x16, 16x8, 8x16 or 8x8
 * with sub partitions) and mc_part() is called once per partition.
 * List 0 is prefetched before and list 1 after the prediction.
 * @param qpix_put   luma put functions, first index selects the block size
 * @param chroma_put chroma put functions, indexed by block size
 * @param qpix_avg   luma avg functions, first index selects the block size
 * @param chroma_avg chroma avg functions, indexed by block size
 * @param weight_op  weight functions; mc_part() receives a pointer to the entry of the partition shape
 * @param weight_avg biweight functions, indexed like weight_op
 */
static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                      qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
                      qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
                      h264_weight_func *weight_op, h264_biweight_func *weight_avg){
    MpegEncContext * const s = &h->s;
    const int mb_xy= h->mb_xy;
    const int mb_type= s->current_picture.mb_type[mb_xy];

    assert(IS_INTER(mb_type));

    prefetch_motion(h, 0);

    if(IS_16X16(mb_type)){
        // one partition covering the whole macroblock
        mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
                &weight_op[0], &weight_avg[0],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
    }else if(IS_16X8(mb_type)){
        // two partitions; the second one uses block 8 and y_offset 4
        mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
        mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
    }else if(IS_8X16(mb_type)){
        // two partitions; the second one uses block 4 and x_offset 4
        mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                &weight_op[2], &weight_avg[2],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
        mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                &weight_op[2], &weight_avg[2],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
    }else{
        int i;

        assert(IS_8X8(mb_type));

        // four 8x8 sub macroblocks, each with its own sub type
        for(i=0; i<4; i++){
            const int sub_mb_type= h->sub_mb_type[i];
            const int n= 4*i;
            int x_offset= (i&1)<<2;
            int y_offset= (i&2)<<1;

            if(IS_SUB_8X8(sub_mb_type)){
                mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                    &weight_op[3], &weight_avg[3],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else if(IS_SUB_8X4(sub_mb_type)){
                // second partition: block n+2, y_offset+2
                mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else if(IS_SUB_4X8(sub_mb_type)){
                // second partition: block n+1, x_offset+2
                mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                    &weight_op[5], &weight_avg[5],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                    &weight_op[5], &weight_avg[5],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else{
                int j;
                assert(IS_SUB_4X4(sub_mb_type));
                // four 4x4 blocks inside the 8x8 sub macroblock
                for(j=0; j<4; j++){
                    int sub_x_offset= x_offset + 2*(j&1);
                    int sub_y_offset= y_offset +   (j&2);
                    mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
                        qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                        &weight_op[6], &weight_avg[6],
                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                }
            }
        }
    }

    prefetch_motion(h, 1);
}
1955
1956 static av_cold void decode_init_vlc(void){
1957     static int done = 0;
1958
1959     if (!done) {
1960         int i;
1961         done = 1;
1962
1963         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1964                  &chroma_dc_coeff_token_len [0], 1, 1,
1965                  &chroma_dc_coeff_token_bits[0], 1, 1, 1);
1966
1967         for(i=0; i<4; i++){
1968             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1969                      &coeff_token_len [i][0], 1, 1,
1970                      &coeff_token_bits[i][0], 1, 1, 1);
1971         }
1972
1973         for(i=0; i<3; i++){
1974             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1975                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1976                      &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
1977         }
1978         for(i=0; i<15; i++){
1979             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
1980                      &total_zeros_len [i][0], 1, 1,
1981                      &total_zeros_bits[i][0], 1, 1, 1);
1982         }
1983
1984         for(i=0; i<6; i++){
1985             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
1986                      &run_len [i][0], 1, 1,
1987                      &run_bits[i][0], 1, 1, 1);
1988         }
1989         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1990                  &run_len [6][0], 1, 1,
1991                  &run_bits[6][0], 1, 1, 1);
1992     }
1993 }
1994
1995 static void free_tables(H264Context *h){
1996     int i;
1997     H264Context *hx;
1998     av_freep(&h->intra4x4_pred_mode);
1999     av_freep(&h->chroma_pred_mode_table);
2000     av_freep(&h->cbp_table);
2001     av_freep(&h->mvd_table[0]);
2002     av_freep(&h->mvd_table[1]);
2003     av_freep(&h->direct_table);
2004     av_freep(&h->non_zero_count);
2005     av_freep(&h->slice_table_base);
2006     h->slice_table= NULL;
2007
2008     av_freep(&h->mb2b_xy);
2009     av_freep(&h->mb2b8_xy);
2010
2011     for(i = 0; i < MAX_SPS_COUNT; i++)
2012         av_freep(h->sps_buffers + i);
2013
2014     for(i = 0; i < MAX_PPS_COUNT; i++)
2015         av_freep(h->pps_buffers + i);
2016
2017     for(i = 0; i < h->s.avctx->thread_count; i++) {
2018         hx = h->thread_context[i];
2019         if(!hx) continue;
2020         av_freep(&hx->top_borders[1]);
2021         av_freep(&hx->top_borders[0]);
2022         av_freep(&hx->s.obmc_scratchpad);
2023     }
2024 }
2025
2026 static void init_dequant8_coeff_table(H264Context *h){
2027     int i,q,x;
2028     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2029     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2030     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2031
2032     for(i=0; i<2; i++ ){
2033         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2034             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2035             break;
2036         }
2037
2038         for(q=0; q<52; q++){
2039             int shift = ff_div6[q];
2040             int idx = ff_rem6[q];
2041             for(x=0; x<64; x++)
2042                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2043                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2044                     h->pps.scaling_matrix8[i][x]) << shift;
2045         }
2046     }
2047 }
2048
2049 static void init_dequant4_coeff_table(H264Context *h){
2050     int i,j,q,x;
2051     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2052     for(i=0; i<6; i++ ){
2053         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2054         for(j=0; j<i; j++){
2055             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2056                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2057                 break;
2058             }
2059         }
2060         if(j<i)
2061             continue;
2062
2063         for(q=0; q<52; q++){
2064             int shift = ff_div6[q] + 2;
2065             int idx = ff_rem6[q];
2066             for(x=0; x<16; x++)
2067                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2068                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2069                     h->pps.scaling_matrix4[i][x]) << shift;
2070         }
2071     }
2072 }
2073
2074 static void init_dequant_tables(H264Context *h){
2075     int i,x;
2076     init_dequant4_coeff_table(h);
2077     if(h->pps.transform_8x8_mode)
2078         init_dequant8_coeff_table(h);
2079     if(h->sps.transform_bypass){
2080         for(i=0; i<6; i++)
2081             for(x=0; x<16; x++)
2082                 h->dequant4_coeff[i][0][x] = 1<<6;
2083         if(h->pps.transform_8x8_mode)
2084             for(i=0; i<2; i++)
2085                 for(x=0; x<64; x++)
2086                     h->dequant8_coeff[i][0][x] = 1<<6;
2087     }
2088 }
2089
2090
2091 /**
2092  * allocates tables.
2093  * needs width/height
2094  */
2095 static int alloc_tables(H264Context *h){
2096     MpegEncContext * const s = &h->s;
2097     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2098     int x,y;
2099
2100     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2101
2102     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2103     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
2104     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2105
2106     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2107     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2108     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2109     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2110
2111     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(uint8_t));
2112     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2113
2114     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2115     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2116     for(y=0; y<s->mb_height; y++){
2117         for(x=0; x<s->mb_width; x++){
2118             const int mb_xy= x + y*s->mb_stride;
2119             const int b_xy = 4*x + 4*y*h->b_stride;
2120             const int b8_xy= 2*x + 2*y*h->b8_stride;
2121
2122             h->mb2b_xy [mb_xy]= b_xy;
2123             h->mb2b8_xy[mb_xy]= b8_xy;
2124         }
2125     }
2126
2127     s->obmc_scratchpad = NULL;
2128
2129     if(!h->dequant4_coeff[0])
2130         init_dequant_tables(h);
2131
2132     return 0;
2133 fail:
2134     free_tables(h);
2135     return -1;
2136 }
2137
2138 /**
2139  * Mimic alloc_tables(), but for every context thread.
2140  */
2141 static void clone_tables(H264Context *dst, H264Context *src){
2142     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2143     dst->non_zero_count           = src->non_zero_count;
2144     dst->slice_table              = src->slice_table;
2145     dst->cbp_table                = src->cbp_table;
2146     dst->mb2b_xy                  = src->mb2b_xy;
2147     dst->mb2b8_xy                 = src->mb2b8_xy;
2148     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2149     dst->mvd_table[0]             = src->mvd_table[0];
2150     dst->mvd_table[1]             = src->mvd_table[1];
2151     dst->direct_table             = src->direct_table;
2152
2153     dst->s.obmc_scratchpad = NULL;
2154     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2155 }
2156
2157 /**
2158  * Init context
2159  * Allocate buffers which are not shared amongst multiple threads.
2160  */
2161 static int context_init(H264Context *h){
2162     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2163     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2164
2165     return 0;
2166 fail:
2167     return -1; // free_tables will clean up for us
2168 }
2169
2170 static av_cold void common_init(H264Context *h){
2171     MpegEncContext * const s = &h->s;
2172
2173     s->width = s->avctx->width;
2174     s->height = s->avctx->height;
2175     s->codec_id= s->avctx->codec->id;
2176
2177     ff_h264_pred_init(&h->hpc, s->codec_id);
2178
2179     h->dequant_coeff_pps= -1;
2180     s->unrestricted_mv=1;
2181     s->decode=1; //FIXME
2182
2183     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2184     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2185 }
2186
2187 static av_cold int decode_init(AVCodecContext *avctx){
2188     H264Context *h= avctx->priv_data;
2189     MpegEncContext * const s = &h->s;
2190
2191     MPV_decode_defaults(s);
2192
2193     s->avctx = avctx;
2194     common_init(h);
2195
2196     s->out_format = FMT_H264;
2197     s->workaround_bugs= avctx->workaround_bugs;
2198
2199     // set defaults
2200 //    s->decode_mb= ff_h263_decode_mb;
2201     s->quarter_sample = 1;
2202     s->low_delay= 1;
2203     avctx->pix_fmt= PIX_FMT_YUV420P;
2204
2205     decode_init_vlc();
2206
2207     if(avctx->extradata_size > 0 && avctx->extradata &&
2208        *(char *)avctx->extradata == 1){
2209         h->is_avc = 1;
2210         h->got_avcC = 0;
2211     } else {
2212         h->is_avc = 0;
2213     }
2214
2215     h->thread_context[0] = h;
2216     return 0;
2217 }
2218
2219 static int frame_start(H264Context *h){
2220     MpegEncContext * const s = &h->s;
2221     int i;
2222
2223     if(MPV_frame_start(s, s->avctx) < 0)
2224         return -1;
2225     ff_er_frame_start(s);
2226     /*
2227      * MPV_frame_start uses pict_type to derive key_frame.
2228      * This is incorrect for H.264; IDR markings must be used.
2229      * Zero here; IDR markings per slice in frame or fields are OR'd in later.
2230      * See decode_nal_units().
2231      */
2232     s->current_picture_ptr->key_frame= 0;
2233
2234     assert(s->linesize && s->uvlinesize);
2235
2236     for(i=0; i<16; i++){
2237         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2238         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2239     }
2240     for(i=0; i<4; i++){
2241         h->block_offset[16+i]=
2242         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2243         h->block_offset[24+16+i]=
2244         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2245     }
2246
2247     /* can't be in alloc_tables because linesize isn't known there.
2248      * FIXME: redo bipred weight to not require extra buffer? */
2249     for(i = 0; i < s->avctx->thread_count; i++)
2250         if(!h->thread_context[i]->s.obmc_scratchpad)
2251             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2252
2253     /* some macroblocks will be accessed before they're available */
2254     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2255         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
2256
2257 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2258     return 0;
2259 }
2260
/**
 * Saves the right column and the bottom row of the current macroblock into
 * h->left_border and h->top_borders[0][mb_x].
 * hl_decode_mb_internal() calls this right before filter_mb_fast(), so the
 * saved pixels are the ones from before this macroblock is loop filtered.
 *
 * @param src_y   top-left luma pixel of the macroblock
 * @param src_cb  top-left Cb pixel of the macroblock
 * @param src_cr  top-left Cr pixel of the macroblock
 * @param simple  if nonzero, chroma is always saved (the gray flag is not checked)
 */
static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
    MpegEncContext * const s = &h->s;
    int i;

    // step one row up so that row index i (1-based) below addresses macroblock row i-1
    src_y  -=   linesize;
    src_cb -= uvlinesize;
    src_cr -= uvlinesize;

    // left_border[0] takes the last luma pixel of the previously saved top row
    // of this column (fetched before that row is overwritten below);
    // left_border[1..16] take the rightmost luma column of this macroblock.
    h->left_border[0]= h->top_borders[0][s->mb_x][15];
    for(i=1; i<17; i++){
        h->left_border[i]= src_y[15+i*  linesize];
    }

    // bytes 0..15 of the top border: the last luma row of this macroblock
    *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
    *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);

    if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
        // same layout for chroma: Cb uses left_border[17..25] and top border
        // bytes 16..23, Cr uses left_border[26..34] and top border bytes 24..31
        h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
        h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
        for(i=1; i<9; i++){
            h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
            h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
        }
        *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
        *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
    }
}
2290
/**
 * Exchanges the pixels to the left of and above the current macroblock in
 * the picture with the copies kept in h->left_border / h->top_borders[0]
 * (filled by backup_mb_border()).
 * hl_decode_mb_internal() calls this with xchg=1 before intra prediction
 * and with xchg=0 afterwards.
 *
 * @param src_y   top-left luma pixel of the macroblock
 * @param src_cb  top-left Cb pixel of the macroblock
 * @param src_cr  top-left Cr pixel of the macroblock
 * @param xchg    passed to XCHG for the left column and the first 8 top luma
 *                pixels; all other top border exchanges are always full swaps
 * @param simple  if nonzero, chroma is always processed (the gray flag is not checked)
 */
static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
    MpegEncContext * const s = &h->s;
    int temp8, i;
    uint64_t temp64;
    int deblock_left;
    int deblock_top;
    int mb_xy;

    // With deblocking_filter == 2 a neighbour only counts if it belongs to the
    // same slice as the current macroblock; otherwise any existing neighbour counts.
    // NOTE(review): presumably mode 2 is "do not filter across slice edges" - confirm.
    if(h->deblocking_filter == 2) {
        mb_xy = h->mb_xy;
        deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
        deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
    } else {
        deblock_left = (s->mb_x > 0);
        deblock_top =  (s->mb_y > 0);
    }

    // move to the pixel diagonally above-left of the macroblock
    src_y  -=   linesize + 1;
    src_cb -= uvlinesize + 1;
    src_cr -= uvlinesize + 1;

// XCHG: b always receives the old value of a; a receives the old value of b
// only if xchg is nonzero (a full swap), otherwise a is left untouched.
#define XCHG(a,b,t,xchg)\
t= a;\
if(xchg)\
    a= b;\
b= t;

    if(deblock_left){
        // the top-left corner pixel (i == 0) is only touched if the top is used too
        for(i = !deblock_top; i<17; i++){
            XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
        }
    }

    if(deblock_top){
        XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
        XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
        // first 8 luma pixels above the macroblock to the right, if there is one
        if(s->mb_x+1 < s->mb_width){
            XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
        }
    }

    if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
        if(deblock_left){
            for(i = !deblock_top; i<9; i++){
                XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
                XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
            }
        }
        if(deblock_top){
            XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
            XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
        }
    }
}
2345
/**
 * Macroblock-pair variant of backup_mb_border(): saves the right column of
 * a 32 row luma / 16 row chroma macroblock pair into h->left_border and its
 * last two rows into h->top_borders[0] and h->top_borders[1].
 * hl_decode_mb_internal() calls this before the pair is passed to filter_mb().
 *
 * @param src_y   top-left luma pixel of the pair
 * @param src_cb  top-left Cb pixel of the pair
 * @param src_cr  top-left Cr pixel of the pair
 */
static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
    MpegEncContext * const s = &h->s;
    int i;

    // step two rows up so that row index i (2-based) below addresses pair row i-2
    src_y  -= 2 *   linesize;
    src_cb -= 2 * uvlinesize;
    src_cr -= 2 * uvlinesize;

    // There are two lines saved, the line above the top macroblock of a pair,
    // and the line above the bottom macroblock
    h->left_border[0]= h->top_borders[0][s->mb_x][15];
    h->left_border[1]= h->top_borders[1][s->mb_x][15];
    // left_border[2..33]: rightmost luma column of the 32 rows of the pair
    for(i=2; i<34; i++){
        h->left_border[i]= src_y[15+i*  linesize];
    }

    // top_borders[0] gets the second to last luma row of the pair, top_borders[1] the last one
    *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
    *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
    *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
    *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);

    if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
        // chroma: Cb uses left_border[34..51] and top border bytes 16..23,
        // Cr uses left_border[52..69] and top border bytes 24..31
        h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
        h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
        h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
        h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
        for(i=2; i<18; i++){
            h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
            h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
        }
        *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
        *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
        *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
        *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
    }
}
2382
/**
 * Macroblock-pair variant of xchg_mb_border(): exchanges the two pixel
 * columns/rows bordering a macroblock pair in the picture with the copies
 * kept in h->left_border and h->top_borders[0..1] (filled by
 * backup_pair_border()).
 * hl_decode_mb_internal() calls this with xchg=1 before intra prediction of
 * an MBAFF macroblock and with xchg=0 before the pair is deblocked.
 *
 * @param src_y   top-left luma pixel of the pair
 * @param src_cb  top-left Cb pixel of the pair
 * @param src_cr  top-left Cr pixel of the pair
 * @param xchg    passed to XCHG for the left column and the first 8 top luma
 *                pixels of each row; the other exchanges are always full swaps
 */
static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
    MpegEncContext * const s = &h->s;
    int temp8, i;
    uint64_t temp64;
    int deblock_left = (s->mb_x > 0);
    // mb_y > 1: there must be a complete macroblock pair above
    int deblock_top  = (s->mb_y > 1);

    tprintf(s->avctx, "xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);

    // move two rows up and one column left of the pair
    src_y  -= 2 *   linesize + 1;
    src_cb -= 2 * uvlinesize + 1;
    src_cr -= 2 * uvlinesize + 1;

// XCHG: b always receives the old value of a; a receives the old value of b
// only if xchg is nonzero (a full swap), otherwise a is left untouched.
#define XCHG(a,b,t,xchg)\
t= a;\
if(xchg)\
    a= b;\
b= t;

    if(deblock_left){
        // the two corner pixels (i < 2) are only touched if the top is used too
        for(i = (!deblock_top)<<1; i<34; i++){
            XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
        }
    }

    if(deblock_top){
        XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
        XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
        XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
        XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
        // first 8 luma pixels of both rows above the pair to the right, if there is one
        if(s->mb_x+1 < s->mb_width){
            XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
            XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
        }
    }

    if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
        if(deblock_left){
            for(i = (!deblock_top) << 1; i<18; i++){
                XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
                XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
            }
        }
        if(deblock_top){
            XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
            XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
            XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
            XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
        }
    }
}
2434
/**
 * Reconstructs the current macroblock (s->mb_x, s->mb_y) into
 * s->current_picture: intra prediction or motion compensation, adding the
 * residual, and - if h->deblocking_filter is set - border backup and loop
 * filtering.
 *
 * @param simple if nonzero, every path guarded by "!simple" (field
 *               macroblocks, MBAFF, intra PCM) is skipped, the codec is
 *               taken to be H.264 and the gray / encoding checks are not
 *               evaluated. The function is av_always_inline and is called
 *               with the constants 0 and 1 by hl_decode_mb_complex() and
 *               hl_decode_mb_simple().
 */
static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
    MpegEncContext * const s = &h->s;
    const int mb_x= s->mb_x;
    const int mb_y= s->mb_y;
    const int mb_xy= h->mb_xy;
    const int mb_type= s->current_picture.mb_type[mb_xy];
    uint8_t  *dest_y, *dest_cb, *dest_cr;
    int linesize, uvlinesize /*dct_offset*/;
    int i;
    int *block_offset = &h->block_offset[0];
    const unsigned int bottom = mb_y & 1;
    const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
    void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);

    // top-left pixel of this macroblock in each plane (16x16 luma, 8x8 chroma)
    dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
    dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
    dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;

    s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
    s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);

    if (!simple && MB_FIELD) {
        // field macroblock: rows are two picture lines apart, so use the
        // doubled line sizes and the second half of block_offset
        linesize   = h->mb_linesize   = s->linesize * 2;
        uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
        block_offset = &h->block_offset[24];
        if(mb_y&1){ //FIXME move out of this func?
            // bottom macroblock of the pair: start on the second line of the pair
            dest_y -= s->linesize*15;
            dest_cb-= s->uvlinesize*7;
            dest_cr-= s->uvlinesize*7;
        }
        if(FRAME_MBAFF) {
            int list;
            // rewrite the cached reference indices to (16+ref)^(mb_y&1)
            for(list=0; list<h->list_count; list++){
                if(!USES_LIST(mb_type, list))
                    continue;
                if(IS_16X16(mb_type)){
                    int8_t *ref = &h->ref_cache[list][scan8[0]];
                    fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
                }else{
                    for(i=0; i<16; i+=4){
                        //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
                        int ref = h->ref_cache[list][scan8[i]];
                        if(ref >= 0)
                            fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
                    }
                }
            }
        }
    } else {
        linesize   = h->mb_linesize   = s->linesize;
        uvlinesize = h->mb_uvlinesize = s->uvlinesize;
//        dct_offset = s->linesize * 16;
    }

    // select the functions that add the luma residual
    if(transform_bypass){
        idct_dc_add =
        idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
    }else if(IS_8x8DCT(mb_type)){
        idct_dc_add = s->dsp.h264_idct8_dc_add;
        idct_add = s->dsp.h264_idct8_add;
    }else{
        idct_dc_add = s->dsp.h264_idct_dc_add;
        idct_add = s->dsp.h264_idct_add;
    }

    // MBAFF + deblocking + intra: swap the saved pair borders into the picture,
    // but only once per pair (skipped for the bottom MB if the top MB was intra too)
    if(!simple && FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
       && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
        int mbt_y = mb_y&~1;
        uint8_t *top_y  = s->current_picture.data[0] + (mbt_y * 16* s->linesize  ) + mb_x * 16;
        uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
        uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
        xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
    }

    if (!simple && IS_INTRA_PCM(mb_type)) {
        unsigned int x, y;

        // The pixels are stored in h->mb array in the same order as levels,
        // copy them in output in the correct order.
        for(i=0; i<16; i++) {
            for (y=0; y<4; y++) {
                for (x=0; x<4; x++) {
                    *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
                }
            }
        }
        for(i=16; i<16+4; i++) {
            for (y=0; y<4; y++) {
                for (x=0; x<4; x++) {
                    *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
                }
            }
        }
        for(i=20; i<20+4; i++) {
            for (y=0; y<4; y++) {
                for (x=0; x<4; x++) {
                    *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
                }
            }
        }
    } else {
        if(IS_INTRA(mb_type)){
            // non-MBAFF: swap the saved borders in for the duration of intra
            // prediction (the matching xchg=0 call follows the prediction below)
            if(h->deblocking_filter && (simple || !FRAME_MBAFF))
                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);

            if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
                h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
                h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
            }

            if(IS_INTRA4x4(mb_type)){
                if(simple || !s->encoding){
                    if(IS_8x8DCT(mb_type)){
                        // intra 8x8: predict, then add the residual of each 8x8 block
                        for(i=0; i<16; i+=4){
                            uint8_t * const ptr= dest_y + block_offset[i];
                            const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
                            const int nnz = h->non_zero_count_cache[ scan8[i] ];
                            h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
                                                   (h->topright_samples_available<<i)&0x4000, linesize);
                            if(nnz){
                                if(nnz == 1 && h->mb[i*16])
                                    idct_dc_add(ptr, h->mb + i*16, linesize);
                                else
                                    idct_add(ptr, h->mb + i*16, linesize);
                            }
                        }
                    }else
                    for(i=0; i<16; i++){
                        uint8_t * const ptr= dest_y + block_offset[i];
                        uint8_t *topright;
                        const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
                        int nnz, tr;

                        if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
                            const int topright_avail= (h->topright_samples_available<<i)&0x8000;
                            assert(mb_y || linesize <= block_offset[i]);
                            if(!topright_avail){
                                // no top-right samples: replicate the last pixel of the row above 4 times
                                tr= ptr[3 - linesize]*0x01010101;
                                topright= (uint8_t*) &tr;
                            }else
                                topright= ptr + 4 - linesize;
                        }else
                            topright= NULL;

                        h->hpc.pred4x4[ dir ](ptr, topright, linesize);
                        nnz = h->non_zero_count_cache[ scan8[i] ];
                        if(nnz){
                            if(is_h264){
                                if(nnz == 1 && h->mb[i*16])
                                    idct_dc_add(ptr, h->mb + i*16, linesize);
                                else
                                    idct_add(ptr, h->mb + i*16, linesize);
                            }else
                                svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
                        }
                    }
                }
            }else{
                // intra 16x16: predict the whole block, then transform the luma DC coefficients
                h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
                if(is_h264){
                    if(!transform_bypass)
                        h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
                }else
                    svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
            }
            if(h->deblocking_filter && (simple || !FRAME_MBAFF))
                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
        }else if(is_h264){
            // inter macroblock: motion compensation
            hl_motion(h, dest_y, dest_cb, dest_cr,
                      s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
                      s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
                      s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
        }


        // luma residual for everything except intra 4x4, which was handled above
        if(!IS_INTRA4x4(mb_type)){
            if(is_h264){
                if(IS_INTRA16x16(mb_type)){
                    for(i=0; i<16; i++){
                        if(h->non_zero_count_cache[ scan8[i] ])
                            idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
                        else if(h->mb[i*16])
                            idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
                    }
                }else{
                    // step over four 4x4 blocks at a time when the 8x8 transform is used
                    const int di = IS_8x8DCT(mb_type) ? 4 : 1;
                    for(i=0; i<16; i+=di){
                        int nnz = h->non_zero_count_cache[ scan8[i] ];
                        if(nnz){
                            if(nnz==1 && h->mb[i*16])
                                idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
                            else
                                idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
                        }
                    }
                }
            }else{
                for(i=0; i<16; i++){
                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
                        uint8_t * const ptr= dest_y + block_offset[i];
                        svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
                    }
                }
            }
        }

        // chroma residual (blocks 16..19 go to Cb, 20..23 to Cr)
        if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
            uint8_t *dest[2] = {dest_cb, dest_cr};
            if(transform_bypass){
                idct_add = idct_dc_add = s->dsp.add_pixels4;
            }else{
                idct_add = s->dsp.h264_idct_add;
                idct_dc_add = s->dsp.h264_idct_dc_add;
                chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
                chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
            }
            if(is_h264){
                for(i=16; i<16+8; i++){
                    if(h->non_zero_count_cache[ scan8[i] ])
                        idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                    else if(h->mb[i*16])
                        idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                }
            }else{
                for(i=16; i<16+8; i++){
                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
                        uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
                        svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
                    }
                }
            }
        }
    }
    if(h->deblocking_filter) {
        if (!simple && FRAME_MBAFF) {
            //FIXME try deblocking one mb at a time?
            // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
            // NOTE: the locals below shadow the outer mb_y/mb_xy and refer to the
            // top macroblock of the pair
            const int mb_y = s->mb_y - 1;
            uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
            const int mb_xy= mb_x + mb_y*s->mb_stride;
            const int mb_type_top   = s->current_picture.mb_type[mb_xy];
            const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
            // the pair is filtered as a whole once its bottom macroblock is done
            if (!bottom) return;
            pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
            pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
            pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;

            if(IS_INTRA(mb_type_top | mb_type_bottom))
                xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);

            backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
            // deblock a pair
            // top
            s->mb_y--; h->mb_xy -= s->mb_stride;
            tprintf(h->s.avctx, "call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
            fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
            h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
            h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
            filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
            // bottom
            s->mb_y++; h->mb_xy += s->mb_stride;
            tprintf(h->s.avctx, "call mbaff filter_mb\n");
            fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
            h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
            h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
            filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
        } else {
            tprintf(h->s.avctx, "call filter_mb\n");
            // save the borders of this macroblock before it is filtered
            backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
            fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
            filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
        }
    }
}
2710
2711 /**
2712  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2713  */
2714 static void hl_decode_mb_simple(H264Context *h){
2715     hl_decode_mb_internal(h, 1);
2716 }
2717
2718 /**
2719  * Process a macroblock; this handles edge cases, such as interlacing.
2720  */
2721 static void av_noinline hl_decode_mb_complex(H264Context *h){
2722     hl_decode_mb_internal(h, 0);
2723 }
2724
2725 static void hl_decode_mb(H264Context *h){
2726     MpegEncContext * const s = &h->s;
2727     const int mb_xy= h->mb_xy;
2728     const int mb_type= s->current_picture.mb_type[mb_xy];
2729     int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 || (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || (ENABLE_H264_ENCODER && s->encoding) || ENABLE_SMALL;
2730
2731     if(ENABLE_H264_ENCODER && !s->decode)
2732         return;
2733
2734     if (is_complex)
2735         hl_decode_mb_complex(h);
2736     else hl_decode_mb_simple(h);
2737 }
2738
2739 static void pic_as_field(Picture *pic, const int parity){
2740     int i;
2741     for (i = 0; i < 4; ++i) {
2742         if (parity == PICT_BOTTOM_FIELD)
2743             pic->data[i] += pic->linesize[i];
2744         pic->reference = parity;
2745         pic->linesize[i] *= 2;
2746     }
2747 }
2748
2749 static int split_field_copy(Picture *dest, Picture *src,
2750                             int parity, int id_add){
2751     int match = !!(src->reference & parity);
2752
2753     if (match) {
2754         *dest = *src;
2755         pic_as_field(dest, parity);
2756         dest->pic_id *= 2;
2757         dest->pic_id += id_add;
2758     }
2759
2760     return match;
2761 }
2762
2763 /**
2764  * Split one reference list into field parts, interleaving by parity
2765  * as per H.264 spec section 8.2.4.2.5. Output fields have their data pointers
2766  * set to look at the actual start of data for that field.
2767  *
2768  * @param dest output list
2769  * @param dest_len maximum number of fields to put in dest
2770  * @param src the source reference list containing fields and/or field pairs
2771  *            (aka short_ref/long_ref, or
2772  *             refFrameListXShortTerm/refFrameListLongTerm in spec-speak)
2773  * @param src_len number of Picture's in source (pairs and unmatched fields)
2774  * @param parity the parity of the picture being decoded/needing
2775  *        these ref pics (PICT_{TOP,BOTTOM}_FIELD)
2776  * @return number of fields placed in dest
2777  */
2778 static int split_field_half_ref_list(Picture *dest, int dest_len,
2779                                      Picture *src,  int src_len,  int parity){
2780     int same_parity   = 1;
2781     int same_i        = 0;
2782     int opp_i         = 0;
2783     int out_i;
2784     int field_output;
2785
2786     for (out_i = 0; out_i < dest_len; out_i += field_output) {
2787         if (same_parity && same_i < src_len) {
2788             field_output = split_field_copy(dest + out_i, src + same_i,
2789                                             parity, 1);
2790             same_parity = !field_output;
2791             same_i++;
2792
2793         } else if (opp_i < src_len) {
2794             field_output = split_field_copy(dest + out_i, src + opp_i,
2795                                             PICT_FRAME - parity, 0);
2796             same_parity = field_output;
2797             opp_i++;
2798
2799         } else {
2800             break;
2801         }
2802     }
2803
2804     return out_i;
2805 }
2806
2807 /**
2808  * Split the reference frame list into a reference field list.
2809  * This implements H.264 spec 8.2.4.2.5 for a combined input list.
2810  * The input list contains both reference field pairs and
2811  * unmatched reference fields; it is ordered as spec describes
2812  * RefPicListX for frames in 8.2.4.2.1 and 8.2.4.2.3, except that
2813  * unmatched field pairs are also present. Conceptually this is equivalent
2814  * to concatenation of refFrameListXShortTerm with refFrameListLongTerm.
2815  *
2816  * @param dest output reference list where ordered fields are to be placed
2817  * @param dest_len max number of fields to place at dest
2818  * @param src source reference list, as described above
2819  * @param src_len number of pictures (pairs and unmatched fields) in src
2820  * @param parity parity of field being currently decoded
2821  *        (one of PICT_{TOP,BOTTOM}_FIELD)
2822  * @param long_i index into src array that holds first long reference picture,
2823  *        or src_len if no long refs present.
2824  */
2825 static int split_field_ref_list(Picture *dest, int dest_len,
2826                                 Picture *src,  int src_len,
2827                                 int parity,    int long_i){
2828
2829     int i = split_field_half_ref_list(dest, dest_len, src, long_i, parity);
2830     dest += i;
2831     dest_len -= i;
2832
2833     i += split_field_half_ref_list(dest, dest_len, src + long_i,
2834                                    src_len - long_i, parity);
2835     return i;
2836 }
2837
2838 /**
2839  * fills the default_ref_list.
2840  */
2841 static int fill_default_ref_list(H264Context *h){
2842     MpegEncContext * const s = &h->s;
2843     int i;
2844     int smallest_poc_greater_than_current = -1;
2845     int structure_sel;
2846     Picture sorted_short_ref[32];
2847     Picture field_entry_list[2][32];
2848     Picture *frame_list[2];
2849
2850     if (FIELD_PICTURE) {
2851         structure_sel = PICT_FRAME;
2852         frame_list[0] = field_entry_list[0];
2853         frame_list[1] = field_entry_list[1];
2854     } else {
2855         structure_sel = 0;
2856         frame_list[0] = h->default_ref_list[0];
2857         frame_list[1] = h->default_ref_list[1];
2858     }
2859
2860     if(h->slice_type==FF_B_TYPE){
2861         int list;
2862         int len[2];
2863         int short_len[2];
2864         int out_i;
2865         int limit= INT_MIN;
2866
2867         /* sort frame according to poc in B slice */
2868         for(out_i=0; out_i<h->short_ref_count; out_i++){
2869             int best_i=INT_MIN;
2870             int best_poc=INT_MAX;
2871
2872             for(i=0; i<h->short_ref_count; i++){
2873                 const int poc= h->short_ref[i]->poc;
2874                 if(poc > limit && poc < best_poc){
2875                     best_poc= poc;
2876                     best_i= i;
2877                 }
2878             }
2879
2880             assert(best_i != INT_MIN);
2881
2882             limit= best_poc;
2883             sorted_short_ref[out_i]= *h->short_ref[best_i];
2884             tprintf(h->s.avctx, "sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
2885             if (-1 == smallest_poc_greater_than_current) {
2886                 if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
2887                     smallest_poc_greater_than_current = out_i;
2888                 }
2889             }
2890         }
2891
2892         tprintf(h->s.avctx, "current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
2893
2894         // find the largest poc
2895         for(list=0; list<2; list++){
2896             int index = 0;
2897             int j= -99;
2898             int step= list ? -1 : 1;
2899
2900             for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
2901                 int sel;
2902                 while(j<0 || j>= h->short_ref_count){
2903                     if(j != -99 && step == (list ? -1 : 1))
2904                         return -1;
2905                     step = -step;
2906                     j= smallest_poc_greater_than_current + (step>>1);
2907                 }
2908                 sel = sorted_short_ref[j].reference | structure_sel;
2909                 if(sel != PICT_FRAME) continue;
2910                 frame_list[list][index  ]= sorted_short_ref[j];
2911                 frame_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
2912             }
2913             short_len[list] = index;
2914
2915             for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
2916                 int sel;
2917                 if(h->long_ref[i] == NULL) continue;
2918                 sel = h->long_ref[i]->reference | structure_sel;
2919                 if(sel != PICT_FRAME) continue;
2920
2921                 frame_list[ list ][index  ]= *h->long_ref[i];
2922                 frame_list[ list ][index++].pic_id= i;
2923             }
2924             len[list] = index;
2925         }
2926
2927         for(list=0; list<2; list++){
2928             if (FIELD_PICTURE)
2929                 len[list] = split_field_ref_list(h->default_ref_list[list],
2930                                                  h->ref_count[list],
2931                                                  frame_list[list],
2932                                                  len[list],
2933                                                  s->picture_structure,
2934                                                  short_len[list]);
2935
2936             // swap the two first elements of L1 when L0 and L1 are identical
2937             if(list && len[0] > 1 && len[0] == len[1])
2938                 for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0]; i++)
2939                     if(i == len[0]){
2940                         FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2941                         break;
2942                     }
2943
2944             if(len[list] < h->ref_count[ list ])
2945                 memset(&h->default_ref_list[list][len[list]], 0, sizeof(Picture)*(h->ref_count[ list ] - len[list]));
2946         }
2947
2948
2949     }else{
2950         int index=0;
2951         int short_len;
2952         for(i=0; i<h->short_ref_count; i++){
2953             int sel;
2954             sel = h->short_ref[i]->reference | structure_sel;
2955             if(sel != PICT_FRAME) continue;
2956             frame_list[0][index  ]= *h->short_ref[i];
2957             frame_list[0][index++].pic_id= h->short_ref[i]->frame_num;
2958         }
2959         short_len = index;
2960         for(i = 0; i < 16; i++){
2961             int sel;
2962             if(h->long_ref[i] == NULL) continue;
2963             sel = h->long_ref[i]->reference | structure_sel;
2964             if(sel != PICT_FRAME) continue;
2965             frame_list[0][index  ]= *h->long_ref[i];
2966             frame_list[0][index++].pic_id= i;
2967         }
2968
2969         if (FIELD_PICTURE)
2970             index = split_field_ref_list(h->default_ref_list[0],
2971                                          h->ref_count[0], frame_list[0],
2972                                          index, s->picture_structure,
2973                                          short_len);
2974
2975         if(index < h->ref_count[0])
2976             memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
2977     }
2978 #ifdef TRACE
2979     for (i=0; i<h->ref_count[0]; i++) {
2980         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2981     }
2982     if(h->slice_type==FF_B_TYPE){
2983         for (i=0; i<h->ref_count[1]; i++) {
2984             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2985         }
2986     }
2987 #endif
2988     return 0;
2989 }
2990
2991 static void print_short_term(H264Context *h);
2992 static void print_long_term(H264Context *h);
2993
2994 /**
2995  * Extract structure information about the picture described by pic_num in
2996  * the current decoding context (frame or field). Note that pic_num is
2997  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2998  * @param pic_num picture number for which to extract structure information
2999  * @param structure one of PICT_XXX describing structure of picture
3000  *                      with pic_num
3001  * @return frame number (short term) or long term index of picture
3002  *         described by pic_num
3003  */
3004 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
3005     MpegEncContext * const s = &h->s;
3006
3007     *structure = s->picture_structure;
3008     if(FIELD_PICTURE){
3009         if (!(pic_num & 1))
3010             /* opposite field */
3011             *structure ^= PICT_FRAME;
3012         pic_num >>= 1;
3013     }
3014
3015     return pic_num;
3016 }
3017
3018 static int decode_ref_pic_list_reordering(H264Context *h){
3019     MpegEncContext * const s = &h->s;
3020     int list, index, pic_structure;
3021
3022     print_short_term(h);
3023     print_long_term(h);
3024     if(h->slice_type==FF_I_TYPE || h->slice_type==FF_SI_TYPE) return 0; //FIXME move before func
3025
3026     for(list=0; list<h->list_count; list++){
3027         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
3028
3029         if(get_bits1(&s->gb)){
3030             int pred= h->curr_pic_num;
3031
3032             for(index=0; ; index++){
3033                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
3034                 unsigned int pic_id;
3035                 int i;
3036                 Picture *ref = NULL;
3037
3038                 if(reordering_of_pic_nums_idc==3)
3039                     break;
3040
3041                 if(index >= h->ref_count[list]){
3042                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
3043                     return -1;
3044                 }
3045
3046                 if(reordering_of_pic_nums_idc<3){
3047                     if(reordering_of_pic_nums_idc<2){
3048                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
3049                         int frame_num;
3050
3051                         if(abs_diff_pic_num > h->max_pic_num){
3052                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
3053                             return -1;
3054                         }
3055
3056                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
3057                         else                                pred+= abs_diff_pic_num;
3058                         pred &= h->max_pic_num - 1;
3059
3060                         frame_num = pic_num_extract(h, pred, &pic_structure);
3061
3062                         for(i= h->short_ref_count-1; i>=0; i--){
3063                             ref = h->short_ref[i];
3064                             assert(ref->reference);
3065                             assert(!ref->long_ref);
3066                             if(ref->data[0] != NULL &&
3067                                    ref->frame_num == frame_num &&
3068                                    (ref->reference & pic_structure) &&
3069                                    ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer
3070                                 break;
3071                         }
3072                         if(i>=0)
3073                             ref->pic_id= pred;
3074                     }else{
3075                         int long_idx;
3076                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
3077
3078                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
3079
3080                         if(long_idx>31){
3081                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
3082                             return -1;
3083                         }
3084                         ref = h->long_ref[long_idx];
3085                         assert(!(ref && !ref->reference));
3086                         if(ref && (ref->reference & pic_structure)){
3087                             ref->pic_id= pic_id;
3088                             assert(ref->long_ref);
3089                             i=0;
3090                         }else{
3091                             i=-1;
3092                         }
3093                     }
3094
3095                     if (i < 0) {
3096                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
3097                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
3098                     } else {
3099                         for(i=index; i+1<h->ref_count[list]; i++){
3100                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
3101                                 break;
3102                         }
3103                         for(; i > index; i--){
3104                             h->ref_list[list][i]= h->ref_list[list][i-1];
3105                         }
3106                         h->ref_list[list][index]= *ref;
3107                         if (FIELD_PICTURE){
3108                             pic_as_field(&h->ref_list[list][index], pic_structure);
3109                         }
3110                     }
3111                 }else{
3112                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
3113                     return -1;
3114                 }
3115             }
3116         }
3117     }
3118     for(list=0; list<h->list_count; list++){
3119         for(index= 0; index < h->ref_count[list]; index++){
3120             if(!h->ref_list[list][index].data[0])
3121                 h->ref_list[list][index]= s->current_picture;
3122         }
3123     }
3124
3125     if(h->slice_type==FF_B_TYPE && !h->direct_spatial_mv_pred)
3126         direct_dist_scale_factor(h);
3127     direct_ref_list_init(h);
3128     return 0;
3129 }
3130
3131 static void fill_mbaff_ref_list(H264Context *h){
3132     int list, i, j;
3133     for(list=0; list<2; list++){ //FIXME try list_count
3134         for(i=0; i<h->ref_count[list]; i++){
3135             Picture *frame = &h->ref_list[list][i];
3136             Picture *field = &h->ref_list[list][16+2*i];
3137             field[0] = *frame;
3138             for(j=0; j<3; j++)
3139                 field[0].linesize[j] <<= 1;
3140             field[0].reference = PICT_TOP_FIELD;
3141             field[1] = field[0];
3142             for(j=0; j<3; j++)
3143                 field[1].data[j] += frame->linesize[j];
3144             field[1].reference = PICT_BOTTOM_FIELD;
3145
3146             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
3147             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
3148             for(j=0; j<2; j++){
3149                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
3150                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
3151             }
3152         }
3153     }
3154     for(j=0; j<h->ref_count[1]; j++){
3155         for(i=0; i<h->ref_count[0]; i++)
3156             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
3157         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
3158         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
3159     }
3160 }
3161
3162 static int pred_weight_table(H264Context *h){
3163     MpegEncContext * const s = &h->s;
3164     int list, i;
3165     int luma_def, chroma_def;
3166
3167     h->use_weight= 0;
3168     h->use_weight_chroma= 0;
3169     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3170     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3171     luma_def = 1<<h->luma_log2_weight_denom;
3172     chroma_def = 1<<h->chroma_log2_weight_denom;
3173
3174     for(list=0; list<2; list++){
3175         for(i=0; i<h->ref_count[list]; i++){
3176             int luma_weight_flag, chroma_weight_flag;
3177
3178             luma_weight_flag= get_bits1(&s->gb);
3179             if(luma_weight_flag){
3180                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3181                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3182                 if(   h->luma_weight[list][i] != luma_def
3183                    || h->luma_offset[list][i] != 0)
3184                     h->use_weight= 1;
3185             }else{
3186                 h->luma_weight[list][i]= luma_def;
3187                 h->luma_offset[list][i]= 0;
3188             }
3189
3190             chroma_weight_flag= get_bits1(&s->gb);
3191             if(chroma_weight_flag){
3192                 int j;
3193                 for(j=0; j<2; j++){
3194                     h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3195                     h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3196                     if(   h->chroma_weight[list][i][j] != chroma_def
3197                        || h->chroma_offset[list][i][j] != 0)
3198                         h->use_weight_chroma= 1;
3199                 }
3200             }else{
3201                 int j;
3202                 for(j=0; j<2; j++){
3203                     h->chroma_weight[list][i][j]= chroma_def;
3204                     h->chroma_offset[list][i][j]= 0;
3205                 }
3206             }
3207         }
3208         if(h->slice_type != FF_B_TYPE) break;
3209     }
3210     h->use_weight= h->use_weight || h->use_weight_chroma;
3211     return 0;
3212 }
3213
3214 static void implicit_weight_table(H264Context *h){
3215     MpegEncContext * const s = &h->s;
3216     int ref0, ref1;
3217     int cur_poc = s->current_picture_ptr->poc;
3218
3219     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3220        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3221         h->use_weight= 0;
3222         h->use_weight_chroma= 0;
3223         return;
3224     }
3225
3226     h->use_weight= 2;
3227     h->use_weight_chroma= 2;
3228     h->luma_log2_weight_denom= 5;
3229     h->chroma_log2_weight_denom= 5;
3230
3231     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3232         int poc0 = h->ref_list[0][ref0].poc;
3233         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3234             int poc1 = h->ref_list[1][ref1].poc;
3235             int td = av_clip(poc1 - poc0, -128, 127);
3236             if(td){
3237                 int tb = av_clip(cur_poc - poc0, -128, 127);
3238                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3239                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3240                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3241                     h->implicit_weight[ref0][ref1] = 32;
3242                 else
3243                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3244             }else
3245                 h->implicit_weight[ref0][ref1] = 32;
3246         }
3247     }
3248 }
3249
3250 /**
3251  * Mark a picture as no longer needed for reference. The refmask
3252  * argument allows unreferencing of individual fields or the whole frame.
3253  * If the picture becomes entirely unreferenced, but is being held for
3254  * display purposes, it is marked as such.
3255  * @param refmask mask of fields to unreference; the mask is bitwise
3256  *                anded with the reference marking of pic
3257  * @return non-zero if pic becomes entirely unreferenced (except possibly
3258  *         for display purposes) zero if one of the fields remains in
3259  *         reference
3260  */
3261 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3262     int i;
3263     if (pic->reference &= refmask) {
3264         return 0;
3265     } else {
3266         if(pic == h->delayed_output_pic)
3267             pic->reference=DELAYED_PIC_REF;
3268         else{
3269             for(i = 0; h->delayed_pic[i]; i++)
3270                 if(pic == h->delayed_pic[i]){
3271                     pic->reference=DELAYED_PIC_REF;
3272                     break;
3273                 }
3274         }
3275         return 1;
3276     }
3277 }
3278
3279 /**
3280  * instantaneous decoder refresh.
3281  */
3282 static void idr(H264Context *h){
3283     int i;
3284
3285     for(i=0; i<16; i++){
3286         if (h->long_ref[i] != NULL) {
3287             unreference_pic(h, h->long_ref[i], 0);
3288             h->long_ref[i]= NULL;
3289         }
3290     }
3291     h->long_ref_count=0;
3292
3293     for(i=0; i<h->short_ref_count; i++){
3294         unreference_pic(h, h->short_ref[i], 0);
3295         h->short_ref[i]= NULL;
3296     }
3297     h->short_ref_count=0;
3298 }
3299
3300 /* forget old pics after a seek */
3301 static void flush_dpb(AVCodecContext *avctx){
3302     H264Context *h= avctx->priv_data;
3303     int i;
3304     for(i=0; i<16; i++) {
3305         if(h->delayed_pic[i])
3306             h->delayed_pic[i]->reference= 0;
3307         h->delayed_pic[i]= NULL;
3308     }
3309     if(h->delayed_output_pic)
3310         h->delayed_output_pic->reference= 0;
3311     h->delayed_output_pic= NULL;
3312     idr(h);
3313     if(h->s.current_picture_ptr)
3314         h->s.current_picture_ptr->reference= 0;
3315     h->s.first_field= 0;
3316     ff_mpeg_flush(avctx);
3317 }
3318
3319 /**
3320  * Find a Picture in the short term reference list by frame number.
3321  * @param frame_num frame number to search for
3322  * @param idx the index into h->short_ref where returned picture is found
3323  *            undefined if no picture found.
3324  * @return pointer to the found picture, or NULL if no pic with the provided
3325  *                 frame number is found
3326  */
3327 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3328     MpegEncContext * const s = &h->s;
3329     int i;
3330
3331     for(i=0; i<h->short_ref_count; i++){
3332         Picture *pic= h->short_ref[i];
3333         if(s->avctx->debug&FF_DEBUG_MMCO)
3334             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3335         if(pic->frame_num == frame_num) {
3336             *idx = i;
3337             return pic;
3338         }
3339     }
3340     return NULL;
3341 }
3342
3343 /**
3344  * Remove a picture from the short term reference list by its index in
3345  * that list.  This does no checking on the provided index; it is assumed
3346  * to be valid. Other list entries are shifted down.
3347  * @param i index into h->short_ref of picture to remove.
3348  */
3349 static void remove_short_at_index(H264Context *h, int i){
3350     assert(i > 0 && i < h->short_ref_count);
3351     h->short_ref[i]= NULL;
3352     if (--h->short_ref_count)
3353         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3354 }
3355
3356 /**
3357  *
3358  * @return the removed picture or NULL if an error occurs
3359  */
3360 static Picture * remove_short(H264Context *h, int frame_num){
3361     MpegEncContext * const s = &h->s;
3362     Picture *pic;
3363     int i;
3364
3365     if(s->avctx->debug&FF_DEBUG_MMCO)
3366         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3367
3368     pic = find_short(h, frame_num, &i);
3369     if (pic)
3370         remove_short_at_index(h, i);
3371
3372     return pic;
3373 }
3374
3375 /**
3376  * Remove a picture from the long term reference list by its index in
3377  * that list.  This does no checking on the provided index; it is assumed
3378  * to be valid. The removed entry is set to NULL. Other entries are unaffected.
3379  * @param i index into h->long_ref of picture to remove.
3380  */
3381 static void remove_long_at_index(H264Context *h, int i){
3382     h->long_ref[i]= NULL;
3383     h->long_ref_count--;
3384 }
3385
3386 /**
3387  *
3388  * @return the removed picture or NULL if an error occurs
3389  */
3390 static Picture * remove_long(H264Context *h, int i){
3391     Picture *pic;
3392
3393     pic= h->long_ref[i];
3394     if (pic)
3395         remove_long_at_index(h, i);
3396
3397     return pic;
3398 }
3399
3400 /**
3401  * print short term list
3402  */
3403 static void print_short_term(H264Context *h) {
3404     uint32_t i;
3405     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3406         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3407         for(i=0; i<h->short_ref_count; i++){
3408             Picture *pic= h->short_ref[i];
3409             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3410         }
3411     }
3412 }
3413
3414 /**
3415  * print long term list
3416  */
3417 static void print_long_term(H264Context *h) {
3418     uint32_t i;
3419     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3420         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3421         for(i = 0; i < 16; i++){
3422             Picture *pic= h->long_ref[i];
3423             if (pic) {
3424                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3425             }
3426         }
3427     }
3428 }
3429
3430 /**
3431  * Executes the reference picture marking (memory management control operations).
3432  */
3433 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3434     MpegEncContext * const s = &h->s;
3435     int i, j;
3436     int current_ref_assigned=0;
3437     Picture *pic;
3438
3439     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3440         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3441
3442     for(i=0; i<mmco_count; i++){
3443         int structure, frame_num, unref_pic;
3444         if(s->avctx->debug&FF_DEBUG_MMCO)
3445             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3446
3447         switch(mmco[i].opcode){
3448         case MMCO_SHORT2UNUSED:
3449             if(s->avctx->debug&FF_DEBUG_MMCO)
3450                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3451             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3452             pic = find_short(h, frame_num, &j);
3453             if (pic) {
3454                 if (unreference_pic(h, pic, structure ^ PICT_FRAME))
3455                     remove_short_at_index(h, j);
3456             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3457                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short failure\n");
3458             break;
3459         case MMCO_SHORT2LONG:
3460             if (FIELD_PICTURE && mmco[i].long_arg < h->long_ref_count &&
3461                     h->long_ref[mmco[i].long_arg]->frame_num ==
3462                                               mmco[i].short_pic_num / 2) {
3463                 /* do nothing, we've already moved this field pair. */
3464             } else {
3465                 int frame_num = mmco[i].short_pic_num >> FIELD_PICTURE;
3466
3467                 pic= remove_long(h, mmco[i].long_arg);
3468                 if(pic) unreference_pic(h, pic, 0);
3469
3470                 h->long_ref[ mmco[i].long_arg ]= remove_short(h, frame_num);
3471                 if (h->long_ref[ mmco[i].long_arg ]){
3472                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3473                     h->long_ref_count++;
3474                 }
3475             }
3476             break;
3477         case MMCO_LONG2UNUSED:
3478             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3479             pic = h->long_ref[j];
3480             if (pic) {
3481                 if (unreference_pic(h, pic, structure ^ PICT_FRAME))
3482                     remove_long_at_index(h, j);
3483             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3484                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3485             break;
3486         case MMCO_LONG:
3487             unref_pic = 1;
3488             if (FIELD_PICTURE && !s->first_field) {
3489                 if (h->long_ref[mmco[i].long_arg] == s->current_picture_ptr) {
3490                     /* Just mark second field as referenced */
3491                     unref_pic = 0;
3492                 } else if (s->current_picture_ptr->reference) {
3493                     /* First field in pair is in short term list or
3494                      * at a different long term index.
3495                      * This is not allowed; see 7.4.3, notes 2 and 3.
3496                      * Report the problem and keep the pair where it is,
3497                      * and mark this field valid.
3498                      */
3499                     av_log(h->s.avctx, AV_LOG_ERROR,
3500                         "illegal long term reference assignment for second "
3501                         "field in complementary field pair (first field is "
3502                         "short term or has non-matching long index)\n");
3503                     unref_pic = 0;
3504                 }
3505             }
3506
3507             if (unref_pic) {
3508                 pic= remove_long(h, mmco[i].long_arg);
3509                 if(pic) unreference_pic(h, pic, 0);
3510
3511                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3512                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3513                 h->long_ref_count++;
3514             }
3515
3516             s->current_picture_ptr->reference |= s->picture_structure;
3517             current_ref_assigned=1;
3518             break;
3519         case MMCO_SET_MAX_LONG:
3520             assert(mmco[i].long_arg <= 16);
3521             // just remove the long term which index is greater than new max
3522             for(j = mmco[i].long_arg; j<16; j++){
3523                 pic = remove_long(h, j);
3524                 if (pic) unreference_pic(h, pic, 0);
3525             }
3526             break;
3527         case MMCO_RESET:
3528             while(h->short_ref_count){
3529                 pic= remove_short(h, h->short_ref[0]->frame_num);
3530                 if(pic) unreference_pic(h, pic, 0);
3531             }
3532             for(j = 0; j < 16; j++) {
3533                 pic= remove_long(h, j);
3534                 if(pic) unreference_pic(h, pic, 0);
3535             }
3536             break;
3537         default: assert(0);
3538         }
3539     }
3540
3541     if (!current_ref_assigned && FIELD_PICTURE &&
3542             !s->first_field && s->current_picture_ptr->reference) {
3543
3544         /* Second field of complementary field pair; the first field of
3545          * which is already referenced. If short referenced, it
3546          * should be first entry in short_ref. If not, it must exist
3547          * in long_ref; trying to put it on the short list here is an
3548          * error in the encoded bit stream (ref: 7.4.3, NOTE 2 and 3).
3549          */
3550         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3551             /* Just mark the second field valid */
3552             s->current_picture_ptr->reference = PICT_FRAME;
3553         } else if (s->current_picture_ptr->long_ref) {
3554             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3555                                              "assignment for second field "
3556                                              "in complementary field pair "
3557                                              "(first field is long term)\n");
3558         } else {
3559             /*
3560              * First field in reference, but not in any sensible place on our
3561              * reference lists. This shouldn't happen unless reference
3562              * handling somewhere else is wrong.
3563              */
3564             assert(0);
3565         }
3566         current_ref_assigned = 1;
3567     }
3568
3569     if(!current_ref_assigned){
3570         pic= remove_short(h, s->current_picture_ptr->frame_num);
3571         if(pic){
3572             unreference_pic(h, pic, 0);
3573             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3574         }
3575
3576         if(h->short_ref_count)
3577             memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3578
3579         h->short_ref[0]= s->current_picture_ptr;
3580         h->short_ref[0]->long_ref=0;
3581         h->short_ref_count++;
3582         s->current_picture_ptr->reference |= s->picture_structure;
3583     }
3584
3585     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3586
3587         /* We have too many reference frames, probably due to corrupted
3588          * stream. Need to discard one frame. Prevents overrun of the
3589          * short_ref and long_ref buffers.
3590          */
3591         av_log(h->s.avctx, AV_LOG_ERROR,
3592                "number of reference frames exceeds max (probably "
3593                "corrupt input), discarding one\n");
3594
3595         if (h->long_ref_count) {
3596             for (i = 0; i < 16; ++i)
3597                 if (h->long_ref[i])
3598                     break;
3599
3600             assert(i < 16);
3601             pic = h->long_ref[i];
3602             remove_long_at_index(h, i);
3603         } else {
3604             pic = h->short_ref[h->short_ref_count - 1];
3605             remove_short_at_index(h, h->short_ref_count - 1);
3606         }
3607         unreference_pic(h, pic, 0);
3608     }
3609
3610     print_short_term(h);
3611     print_long_term(h);
3612     return 0;
3613 }
3614
3615 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3616     MpegEncContext * const s = &h->s;
3617     int i;
3618
3619     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3620         s->broken_link= get_bits1(gb) -1;
3621         h->mmco[0].long_arg= get_bits1(gb) - 1; // current_long_term_idx
3622         if(h->mmco[0].long_arg == -1)
3623             h->mmco_index= 0;
3624         else{
3625             h->mmco[0].opcode= MMCO_LONG;
3626             h->mmco_index= 1;
3627         }
3628     }else{
3629         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3630             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3631                 MMCOOpcode opcode= get_ue_golomb(gb);
3632
3633                 h->mmco[i].opcode= opcode;
3634                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3635                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3636 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3637                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3638                         return -1;
3639                     }*/
3640                 }
3641                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3642                     unsigned int long_arg= get_ue_golomb(gb);
3643                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3644                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3645                         return -1;
3646                     }
3647                     h->mmco[i].long_arg= long_arg;
3648                 }
3649
3650                 if(opcode > (unsigned)MMCO_LONG){
3651                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3652                     return -1;
3653                 }
3654                 if(opcode == MMCO_END)
3655                     break;
3656             }
3657             h->mmco_index= i;
3658         }else{
3659             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3660
3661             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3662                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3663                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3664                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3665                 h->mmco_index= 1;
3666                 if (FIELD_PICTURE) {
3667                     h->mmco[0].short_pic_num *= 2;
3668                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3669                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3670                     h->mmco_index= 2;
3671                 }
3672             }else
3673                 h->mmco_index= 0;
3674         }
3675     }
3676
3677     return 0;
3678 }
3679
3680 static int init_poc(H264Context *h){
3681     MpegEncContext * const s = &h->s;
3682     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3683     int field_poc[2];
3684
3685     if(h->nal_unit_type == NAL_IDR_SLICE){
3686         h->frame_num_offset= 0;
3687     }else{
3688         if(h->frame_num < h->prev_frame_num)
3689             h->frame_num_offset= h->prev_frame_num_offset + max_frame_num;
3690         else
3691             h->frame_num_offset= h->prev_frame_num_offset;
3692     }
3693
3694     if(h->sps.poc_type==0){
3695         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3696
3697         if(h->nal_unit_type == NAL_IDR_SLICE){
3698              h->prev_poc_msb=
3699              h->prev_poc_lsb= 0;
3700         }
3701
3702         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3703             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3704         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3705             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3706         else
3707             h->poc_msb = h->prev_poc_msb;
3708 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3709         field_poc[0] =
3710         field_poc[1] = h->poc_msb + h->poc_lsb;
3711         if(s->picture_structure == PICT_FRAME)
3712             field_poc[1] += h->delta_poc_bottom;
3713     }else if(h->sps.poc_type==1){
3714         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3715         int i;
3716
3717         if(h->sps.poc_cycle_length != 0)
3718             abs_frame_num = h->frame_num_offset + h->frame_num;
3719         else
3720             abs_frame_num = 0;
3721
3722         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3723             abs_frame_num--;
3724
3725         expected_delta_per_poc_cycle = 0;
3726         for(i=0; i < h->sps.poc_cycle_length; i++)
3727             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3728
3729         if(abs_frame_num > 0){
3730             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3731             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3732
3733             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3734             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3735                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3736         } else
3737             expectedpoc = 0;
3738
3739         if(h->nal_ref_idc == 0)
3740             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3741
3742         field_poc[0] = expectedpoc + h->delta_poc[0];
3743         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3744
3745         if(s->picture_structure == PICT_FRAME)
3746             field_poc[1] += h->delta_poc[1];
3747     }else{
3748         int poc;
3749         if(h->nal_unit_type == NAL_IDR_SLICE){
3750             poc= 0;
3751         }else{
3752             if(h->nal_ref_idc) poc= 2*(h->frame_num_offset + h->frame_num);
3753             else               poc= 2*(h->frame_num_offset + h->frame_num) - 1;
3754         }
3755         field_poc[0]= poc;
3756         field_poc[1]= poc;
3757     }
3758
3759     if(s->picture_structure != PICT_BOTTOM_FIELD) {
3760         s->current_picture_ptr->field_poc[0]= field_poc[0];
3761         s->current_picture_ptr->poc = field_poc[0];
3762     }
3763     if(s->picture_structure != PICT_TOP_FIELD) {
3764         s->current_picture_ptr->field_poc[1]= field_poc[1];
3765         s->current_picture_ptr->poc = field_poc[1];
3766     }
3767     if(!FIELD_PICTURE || !s->first_field) {
3768         Picture *cur = s->current_picture_ptr;
3769         cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3770     }
3771
3772     return 0;
3773 }
3774
3775
3776 /**
3777  * initialize scan tables
3778  */
3779 static void init_scan_tables(H264Context *h){
3780     MpegEncContext * const s = &h->s;
3781     int i;
3782     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3783         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3784         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3785     }else{
3786         for(i=0; i<16; i++){
3787 #define T(x) (x>>2) | ((x<<2) & 0xF)
3788             h->zigzag_scan[i] = T(zigzag_scan[i]);
3789             h-> field_scan[i] = T( field_scan[i]);
3790 #undef T
3791         }
3792     }
3793     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3794         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
3795         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3796         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3797         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3798     }else{
3799         for(i=0; i<64; i++){
3800 #define T(x) (x>>3) | ((x&7)<<3)
3801             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
3802             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3803             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3804             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3805 #undef T
3806         }
3807     }
3808     if(h->sps.transform_bypass){ //FIXME same ugly
3809         h->zigzag_scan_q0          = zigzag_scan;
3810         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
3811         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3812         h->field_scan_q0           = field_scan;
3813         h->field_scan8x8_q0        = field_scan8x8;
3814         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3815     }else{
3816         h->zigzag_scan_q0          = h->zigzag_scan;
3817         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3818         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3819         h->field_scan_q0           = h->field_scan;
3820         h->field_scan8x8_q0        = h->field_scan8x8;
3821         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3822     }
3823 }
3824
3825 /**
3826  * Replicates H264 "master" context to thread contexts.
3827  */
3828 static void clone_slice(H264Context *dst, H264Context *src)
3829 {
3830     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3831     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3832     dst->s.current_picture      = src->s.current_picture;
3833     dst->s.linesize             = src->s.linesize;
3834     dst->s.uvlinesize           = src->s.uvlinesize;
3835     dst->s.first_field          = src->s.first_field;
3836
3837     dst->prev_poc_msb           = src->prev_poc_msb;
3838     dst->prev_poc_lsb           = src->prev_poc_lsb;
3839     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3840     dst->prev_frame_num         = src->prev_frame_num;
3841     dst->short_ref_count        = src->short_ref_count;
3842
3843     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3844     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3845     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3846     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3847
3848     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3849     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3850 }
3851
3852 /**
3853  * decodes a slice header.
3854  * This will also call MPV_common_init() and frame_start() as needed.
3855  *
3856  * @param h h264context
3857  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3858  *
3859  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3860  */
3861 static int decode_slice_header(H264Context *h, H264Context *h0){
3862     MpegEncContext * const s = &h->s;
3863     MpegEncContext * const s0 = &h0->s;
3864     unsigned int first_mb_in_slice;
3865     unsigned int pps_id;
3866     int num_ref_idx_active_override_flag;
3867     static const uint8_t slice_type_map[5]= {FF_P_TYPE, FF_B_TYPE, FF_I_TYPE, FF_SP_TYPE, FF_SI_TYPE};
3868     unsigned int slice_type, tmp, i;
3869     int default_ref_list_done = 0;
3870     int last_pic_structure;
3871
3872     s->dropable= h->nal_ref_idc == 0;
3873
3874     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3875         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3876         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3877     }else{
3878         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3879         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3880     }
3881
3882     first_mb_in_slice= get_ue_golomb(&s->gb);
3883
3884     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3885         h0->current_slice = 0;
3886         if (!s0->first_field)
3887             s->current_picture_ptr= NULL;
3888     }
3889
3890     slice_type= get_ue_golomb(&s->gb);
3891     if(slice_type > 9){
3892         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3893         return -1;
3894     }
3895     if(slice_type > 4){
3896         slice_type -= 5;
3897         h->slice_type_fixed=1;
3898     }else
3899         h->slice_type_fixed=0;
3900
3901     slice_type= slice_type_map[ slice_type ];
3902     if (slice_type == FF_I_TYPE
3903         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3904         default_ref_list_done = 1;
3905     }
3906     h->slice_type= slice_type;
3907
3908     s->pict_type= h->slice_type; // to make a few old func happy, it's wrong though
3909     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3910         av_log(h->s.avctx, AV_LOG_ERROR,
3911                "B picture before any references, skipping\n");
3912         return -1;
3913     }
3914
3915     pps_id= get_ue_golomb(&s->gb);
3916     if(pps_id>=MAX_PPS_COUNT){
3917         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3918         return -1;
3919     }
3920     if(!h0->pps_buffers[pps_id]) {
3921         av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n");
3922         return -1;
3923     }
3924     h->pps= *h0->pps_buffers[pps_id];
3925
3926     if(!h0->sps_buffers[h->pps.sps_id]) {
3927         av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n");
3928         return -1;
3929     }
3930     h->sps = *h0->sps_buffers[h->pps.sps_id];
3931
3932     if(h == h0 && h->dequant_coeff_pps != pps_id){
3933         h->dequant_coeff_pps = pps_id;
3934         init_dequant_tables(h);
3935     }
3936
3937     s->mb_width= h->sps.mb_width;
3938     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3939
3940     h->b_stride=  s->mb_width*4;
3941     h->b8_stride= s->mb_width*2;
3942
3943     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3944     if(h->sps.frame_mbs_only_flag)
3945         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3946     else
3947         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3948
3949     if (s->context_initialized
3950         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3951         if(h != h0)
3952             return -1;   // width / height changed during parallelized decoding
3953         free_tables(h);
3954         MPV_common_end(s);
3955     }
3956     if (!s->context_initialized) {
3957         if(h != h0)
3958             return -1;  // we cant (re-)initialize context during parallel decoding
3959         if (MPV_common_init(s) < 0)
3960             return -1;
3961         s->first_field = 0;
3962
3963         init_scan_tables(h);
3964         alloc_tables(h);
3965
3966         for(i = 1; i < s->avctx->thread_count; i++) {
3967             H264Context *c;
3968             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3969             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3970             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3971             c->sps = h->sps;
3972             c->pps = h->pps;
3973             init_scan_tables(c);
3974             clone_tables(c, h);
3975         }
3976
3977         for(i = 0; i < s->avctx->thread_count; i++)
3978             if(context_init(h->thread_context[i]) < 0)
3979                 return -1;
3980
3981         s->avctx->width = s->width;
3982         s->avctx->height = s->height;
3983         s->avctx->sample_aspect_ratio= h->sps.sar;
3984         if(!s->avctx->sample_aspect_ratio.den)
3985             s->avctx->sample_aspect_ratio.den = 1;
3986
3987         if(h->sps.timing_info_present_flag){
3988             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3989             if(h->x264_build > 0 && h->x264_build < 44)
3990                 s->avctx->time_base.den *= 2;
3991             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3992                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3993         }
3994     }
3995
3996     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3997
3998     h->mb_mbaff = 0;
3999     h->mb_aff_frame = 0;
4000     last_pic_structure = s0->picture_structure;
4001     if(h->sps.frame_mbs_only_flag){
4002         s->picture_structure= PICT_FRAME;
4003     }else{
4004         if(get_bits1(&s->gb)) { //field_pic_flag
4005             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
4006         } else {
4007             s->picture_structure= PICT_FRAME;
4008             h->mb_aff_frame = h->sps.mb_aff;
4009         }
4010     }
4011
4012     if(h0->current_slice == 0){
4013         /* See if we have a decoded first field looking for a pair... */
4014         if (s0->first_field) {
4015             assert(s0->current_picture_ptr);
4016             assert(s0->current_picture_ptr->data[0]);
4017             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
4018
4019             /* figure out if we have a complementary field pair */
4020             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
4021                 /*
4022                  * Previous field is unmatched. Don't display it, but let it
4023                  * remain for reference if marked as such.
4024                  */
4025                 s0->current_picture_ptr = NULL;
4026                 s0->first_field = FIELD_PICTURE;
4027
4028             } else {
4029                 if (h->nal_ref_idc &&
4030                         s0->current_picture_ptr->reference &&
4031                         s0->current_picture_ptr->frame_num != h->frame_num) {
4032                     /*
4033                      * This and previous field were reference, but had
4034                      * different frame_nums. Consider this field first in
4035                      * pair. Throw away previous field except for reference
4036                      * purposes.
4037                      */
4038                     s0->first_field = 1;
4039                     s0->current_picture_ptr = NULL;
4040
4041                 } else {
4042                     /* Second field in complementary pair */
4043                     s0->first_field = 0;
4044                 }
4045             }
4046
4047         } else {
4048             /* Frame or first field in a potentially complementary pair */
4049             assert(!s0->current_picture_ptr);
4050             s0->first_field = FIELD_PICTURE;
4051         }
4052
4053         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
4054             s0->first_field = 0;
4055             return -1;
4056         }
4057     }
4058     if(h != h0)
4059         clone_slice(h, h0);
4060
4061     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
4062
4063     assert(s->mb_num == s->mb_width * s->mb_height);
4064     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
4065        first_mb_in_slice                    >= s->mb_num){
4066         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
4067         return -1;
4068     }
4069     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
4070     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
4071     if (s->picture_structure == PICT_BOTTOM_FIELD)
4072         s->resync_mb_y = s->mb_y = s->mb_y + 1;
4073     assert(s->mb_y < s->mb_height);
4074
4075     if(s->picture_structure==PICT_FRAME){
4076         h->curr_pic_num=   h->frame_num;
4077         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
4078     }else{
4079         h->curr_pic_num= 2*h->frame_num + 1;
4080         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
4081     }
4082
4083     if(h->nal_unit_type == NAL_IDR_SLICE){
4084         get_ue_golomb(&s->gb); /* idr_pic_id */
4085     }
4086
4087     if(h->sps.poc_type==0){
4088         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
4089
4090         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
4091             h->delta_poc_bottom= get_se_golomb(&s->gb);
4092         }
4093     }
4094
4095     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
4096         h->delta_poc[0]= get_se_golomb(&s->gb);
4097
4098         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
4099             h->delta_poc[1]= get_se_golomb(&s->gb);
4100     }
4101
4102     init_poc(h);
4103
4104     if(h->pps.redundant_pic_cnt_present){
4105         h->redundant_pic_count= get_ue_golomb(&s->gb);
4106     }
4107
4108     //set defaults, might be overriden a few line later
4109     h->ref_count[0]= h->pps.ref_count[0];
4110     h->ref_count[1]= h->pps.ref_count[1];
4111
4112     if(h->slice_type == FF_P_TYPE || h->slice_type == FF_SP_TYPE || h->slice_type == FF_B_TYPE){
4113         if(h->slice_type == FF_B_TYPE){
4114             h->direct_spatial_mv_pred= get_bits1(&s->gb);
4115             if(FIELD_PICTURE && h->direct_spatial_mv_pred)
4116                 av_log(h->s.avctx, AV_LOG_ERROR, "PAFF + spatial direct mode is not implemented\n");
4117         }
4118         num_ref_idx_active_override_flag= get_bits1(&s->gb);
4119
4120         if(num_ref_idx_active_override_flag){
4121             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
4122             if(h->slice_type==FF_B_TYPE)
4123                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
4124
4125             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
4126                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
4127                 h->ref_count[0]= h->ref_count[1]= 1;
4128                 return -1;
4129             }
4130         }
4131         if(h->slice_type == FF_B_TYPE)
4132             h->list_count= 2;
4133         else
4134             h->list_count= 1;
4135     }else
4136         h->list_count= 0;
4137
4138     if(!default_ref_list_done){
4139         fill_default_ref_list(h);
4140     }
4141
4142     if(decode_ref_pic_list_reordering(h) < 0)
4143         return -1;
4144
4145     if(   (h->pps.weighted_pred          && (h->slice_type == FF_P_TYPE || h->slice_type == FF_SP_TYPE ))
4146        || (h->pps.weighted_bipred_idc==1 && h->slice_type==FF_B_TYPE ) )
4147         pred_weight_table(h);
4148     else if(h->pps.weighted_bipred_idc==2 && h->slice_type==FF_B_TYPE)
4149         implicit_weight_table(h);
4150     else
4151         h->use_weight = 0;
4152
4153     if(h->nal_ref_idc)
4154         decode_ref_pic_marking(h0, &s->gb);
4155
4156     if(FRAME_MBAFF)
4157         fill_mbaff_ref_list(h);
4158
4159     if( h->slice_type != FF_I_TYPE && h->slice_type != FF_SI_TYPE && h->pps.cabac ){
4160         tmp = get_ue_golomb(&s->gb);
4161         if(tmp > 2){
4162             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
4163             return -1;
4164         }
4165         h->cabac_init_idc= tmp;
4166     }
4167
4168     h->last_qscale_diff = 0;
4169     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
4170     if(tmp>51){
4171         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
4172         return -1;
4173     }
4174     s->qscale= tmp;
4175     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
4176     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
4177     //FIXME qscale / qp ... stuff
4178     if(h->slice_type == FF_SP_TYPE){
4179         get_bits1(&s->gb); /* sp_for_switch_flag */
4180     }
4181     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
4182         get_se_golomb(&s->gb); /* slice_qs_delta */
4183     }
4184
4185     h->deblocking_filter = 1;
4186     h->slice_alpha_c0_offset = 0;
4187     h->slice_beta_offset = 0;
4188     if( h->pps.deblocking_filter_parameters_present ) {
4189         tmp= get_ue_golomb(&s->gb);
4190         if(tmp > 2){
4191             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4192             return -1;
4193         }
4194         h->deblocking_filter= tmp;
4195         if(h->deblocking_filter < 2)
4196             h->deblocking_filter^= 1; // 1<->0
4197
4198         if( h->deblocking_filter ) {
4199             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4200             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4201         }
4202     }
4203
4204     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4205        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type != FF_I_TYPE)
4206        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type == FF_B_TYPE)
4207        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4208         h->deblocking_filter= 0;
4209
4210     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4211         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4212             /* Cheat slightly for speed:
4213                Do not bother to deblock across slices. */
4214             h->deblocking_filter = 2;
4215         } else {
4216             h0->max_contexts = 1;
4217             if(!h0->single_decode_warning) {
4218                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4219                 h0->single_decode_warning = 1;
4220             }
4221             if(h != h0)
4222                 return 1; // deblocking switched inside frame
4223         }
4224     }
4225
4226 #if 0 //FMO
4227     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4228         slice_group_change_cycle= get_bits(&s->gb, ?);
4229 #endif
4230
4231     h0->last_slice_type = slice_type;
4232     h->slice_num = ++h0->current_slice;
4233
4234     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4235     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4236
4237     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4238         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4239                h->slice_num,
4240                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4241                first_mb_in_slice,
4242                av_get_pict_type_char(h->slice_type),
4243                pps_id, h->frame_num,
4244                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4245                h->ref_count[0], h->ref_count[1],
4246                s->qscale,
4247                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4248                h->use_weight,
4249                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4250                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4251                );
4252     }
4253
4254     return 0;
4255 }
4256
4257 /**
4258  *
4259  */
4260 static inline int get_level_prefix(GetBitContext *gb){
4261     unsigned int buf;
4262     int log;
4263
4264     OPEN_READER(re, gb);
4265     UPDATE_CACHE(re, gb);
4266     buf=GET_CACHE(re, gb);
4267
4268     log= 32 - av_log2(buf);
4269 #ifdef TRACE
4270     print_bin(buf>>(32-log), log);
4271     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4272 #endif
4273
4274     LAST_SKIP_BITS(re, gb, log);
4275     CLOSE_READER(re, gb);
4276
4277     return log-1;
4278 }
4279
4280 static inline int get_dct8x8_allowed(H264Context *h){
4281     int i;
4282     for(i=0; i<4; i++){
4283         if(!IS_SUB_8X8(h->sub_mb_type[i])
4284            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4285             return 0;
4286     }
4287     return 1;
4288 }
4289
4290 /**
4291  * decodes a residual block.
4292  * @param n block index
4293  * @param scantable scantable
4294  * @param max_coeff number of coefficients in the block
4295  * @return <0 if an error occurred
4296  */
4297 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4298     MpegEncContext * const s = &h->s;
4299     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4300     int level[16];
4301     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4302
4303     //FIXME put trailing_onex into the context
4304
4305     if(n == CHROMA_DC_BLOCK_INDEX){
4306         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4307         total_coeff= coeff_token>>2;
4308     }else{
4309         if(n == LUMA_DC_BLOCK_INDEX){
4310             total_coeff= pred_non_zero_count(h, 0);
4311             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4312             total_coeff= coeff_token>>2;
4313         }else{
4314             total_coeff= pred_non_zero_count(h, n);
4315             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4316             total_coeff= coeff_token>>2;
4317             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4318         }
4319     }
4320
4321     //FIXME set last_non_zero?
4322
4323     if(total_coeff==0)
4324         return 0;
4325     if(total_coeff > (unsigned)max_coeff) {
4326         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4327         return -1;
4328     }
4329
4330     trailing_ones= coeff_token&3;
4331     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4332     assert(total_coeff<=16);
4333
4334     for(i=0; i<trailing_ones; i++){
4335         level[i]= 1 - 2*get_bits1(gb);
4336     }
4337
4338     if(i<total_coeff) {
4339         int level_code, mask;
4340         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4341         int prefix= get_level_prefix(gb);
4342
4343         //first coefficient has suffix_length equal to 0 or 1
4344         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4345             if(suffix_length)
4346                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4347             else
4348                 level_code= (prefix<<suffix_length); //part
4349         }else if(prefix==14){
4350             if(suffix_length)
4351                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4352             else
4353                 level_code= prefix + get_bits(gb, 4); //part
4354         }else if(prefix==15){
4355             level_code= (prefix<<suffix_length) + get_bits(gb, 12); //part
4356             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4357         }else{
4358             av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4359             return -1;
4360         }
4361
4362         if(trailing_ones < 3) level_code += 2;
4363
4364         suffix_length = 1;
4365         if(level_code > 5)
4366             suffix_length++;
4367         mask= -(level_code&1);
4368         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4369         i++;
4370
4371         //remaining coefficients have suffix_length > 0
4372         for(;i<total_coeff;i++) {
4373             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4374             prefix = get_level_prefix(gb);
4375             if(prefix<15){
4376                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4377             }else if(prefix==15){
4378                 level_code =  (prefix<<suffix_length) + get_bits(gb, 12);
4379             }else{
4380                 av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4381                 return -1;
4382             }
4383             mask= -(level_code&1);
4384             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4385             if(level_code > suffix_limit[suffix_length])
4386                 suffix_length++;
4387         }
4388     }
4389
4390     if(total_coeff == max_coeff)
4391         zeros_left=0;
4392     else{
4393         if(n == CHROMA_DC_BLOCK_INDEX)
4394             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4395         else
4396             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4397     }
4398
4399     coeff_num = zeros_left + total_coeff - 1;
4400     j = scantable[coeff_num];
4401     if(n > 24){
4402         block[j] = level[0];
4403         for(i=1;i<total_coeff;i++) {
4404             if(zeros_left <= 0)
4405                 run_before = 0;
4406             else if(zeros_left < 7){
4407                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4408             }else{
4409                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4410             }
4411             zeros_left -= run_before;
4412             coeff_num -= 1 + run_before;
4413             j= scantable[ coeff_num ];
4414
4415             block[j]= level[i];
4416         }
4417     }else{
4418         block[j] = (level[0] * qmul[j] + 32)>>6;
4419         for(i=1;i<total_coeff;i++) {
4420             if(zeros_left <= 0)
4421                 run_before = 0;
4422             else if(zeros_left < 7){
4423                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4424             }else{
4425                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4426             }
4427             zeros_left -= run_before;
4428             coeff_num -= 1 + run_before;
4429             j= scantable[ coeff_num ];
4430
4431             block[j]= (level[i] * qmul[j] + 32)>>6;
4432         }
4433     }
4434
4435     if(zeros_left<0){
4436         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4437         return -1;
4438     }
4439
4440     return 0;
4441 }
4442
4443 static void predict_field_decoding_flag(H264Context *h){
4444     MpegEncContext * const s = &h->s;
4445     const int mb_xy= h->mb_xy;
4446     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4447                 ? s->current_picture.mb_type[mb_xy-1]
4448                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4449                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4450                 : 0;
4451     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4452 }
4453
4454 /**
4455  * decodes a P_SKIP or B_SKIP macroblock
4456  */
4457 static void decode_mb_skip(H264Context *h){
4458     MpegEncContext * const s = &h->s;
4459     const int mb_xy= h->mb_xy;
4460     int mb_type=0;
4461
4462     memset(h->non_zero_count[mb_xy], 0, 16);
4463     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4464
4465     if(MB_FIELD)
4466         mb_type|= MB_TYPE_INTERLACED;
4467
4468     if( h->slice_type == FF_B_TYPE )
4469     {
4470         // just for fill_caches. pred_direct_motion will set the real mb_type
4471         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4472
4473         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4474         pred_direct_motion(h, &mb_type);
4475         mb_type|= MB_TYPE_SKIP;
4476     }
4477     else
4478     {
4479         int mx, my;
4480         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4481
4482         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4483         pred_pskip_motion(h, &mx, &my);
4484         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4485         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4486     }
4487
4488     write_back_motion(h, mb_type);
4489     s->current_picture.mb_type[mb_xy]= mb_type;
4490     s->current_picture.qscale_table[mb_xy]= s->qscale;
4491     h->slice_table[ mb_xy ]= h->slice_num;
4492     h->prev_mb_skipped= 1;
4493 }
4494
4495 /**
4496  * decodes a macroblock of a CAVLC coded slice: skip run, mb_type, intra prediction modes or reference indices and motion vectors, coded block pattern, dquant and finally the residual blocks
4497  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed (NOTE(review): every error path visible in this function returns -1)
4498  */
4499 static int decode_mb_cavlc(H264Context *h){
4500     MpegEncContext * const s = &h->s;
4501     int mb_xy;
4502     int partition_count;
4503     unsigned int mb_type, cbp;
4504     int dct8x8_allowed= h->pps.transform_8x8_mode; // may be cleared below depending on the sub macroblock types / direct inference flag
4505
4506     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride; // linear index of the current macroblock
4507
4508     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handling?)
4509
4510     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4511     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4512                 down the code */
4513     if(h->slice_type != FF_I_TYPE && h->slice_type != FF_SI_TYPE){
4514         if(s->mb_skip_run==-1) // a value of -1 makes us read a new skip run length
4515             s->mb_skip_run= get_ue_golomb(&s->gb);
4516
4517         if (s->mb_skip_run--) { // run not exhausted: this macroblock is skipped (the counter is decremented either way)
4518             if(FRAME_MBAFF && (s->mb_y&1) == 0){ // MBAFF and even mb_y: the field decoding flag is determined here
4519                 if(s->mb_skip_run==0) // last macroblock of the run: the flag is read from the bitstream
4520                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4521                 else
4522                     predict_field_decoding_flag(h); // otherwise it is inferred from the left / top neighbour
4523             }
4524             decode_mb_skip(h);
4525             return 0;
4526         }
4527     }
4528     if(FRAME_MBAFF){
4529         if( (s->mb_y&1) == 0 )
4530             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4531     }else
4532         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME); // without MBAFF the flag follows the picture structure
4533
4534     h->prev_mb_skipped= 0;
4535
4536     mb_type= get_ue_golomb(&s->gb); // raw mb_type code, translated through the per slice type tables below
4537     if(h->slice_type == FF_B_TYPE){
4538         if(mb_type < 23){ // 0..22 index b_mb_type_info, larger codes are intra types offset by 23
4539             partition_count= b_mb_type_info[mb_type].partition_count;
4540             mb_type=         b_mb_type_info[mb_type].type;
4541         }else{
4542             mb_type -= 23;
4543             goto decode_intra_mb;
4544         }
4545     }else if(h->slice_type == FF_P_TYPE /*|| h->slice_type == FF_SP_TYPE */){
4546         if(mb_type < 5){ // 0..4 index p_mb_type_info, larger codes are intra types offset by 5
4547             partition_count= p_mb_type_info[mb_type].partition_count;
4548             mb_type=         p_mb_type_info[mb_type].type;
4549         }else{
4550             mb_type -= 5;
4551             goto decode_intra_mb;
4552         }
4553     }else{
4554        assert(h->slice_type == FF_I_TYPE);
4555 decode_intra_mb:
4556         if(mb_type > 25){ // i_mb_type_info is only indexed with 0..25
4557             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4558             return -1;
4559         }
4560         partition_count=0;
4561         cbp= i_mb_type_info[mb_type].cbp; // only kept for intra 16x16, all other types read an explicit cbp below
4562         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4563         mb_type= i_mb_type_info[mb_type].type;
4564     }
4565
4566     if(MB_FIELD)
4567         mb_type |= MB_TYPE_INTERLACED;
4568
4569     h->slice_table[ mb_xy ]= h->slice_num; // mark this macroblock as part of the current slice
4570
4571     if(IS_INTRA_PCM(mb_type)){
4572         unsigned int x, y;
4573
4574         // We assume these blocks are very rare so we do not optimize it.
4575         align_get_bits(&s->gb);
4576
4577         // The pixels are stored in the same order as levels in h->mb array.
4578         for(y=0; y<16; y++){ // 16x16 luma samples, 8 bits each
4579             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
4580             for(x=0; x<16; x++){
4581                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4582                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
4583             }
4584         }
4585         for(y=0; y<8; y++){ // first 8x8 chroma plane, stored from offset 256
4586             const int index= 256 + 4*(y&3) + 32*(y>>2);
4587             for(x=0; x<8; x++){
4588                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4589                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4590             }
4591         }
4592         for(y=0; y<8; y++){ // second 8x8 chroma plane, stored from offset 256 + 64
4593             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
4594             for(x=0; x<8; x++){
4595                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4596                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4597             }
4598         }
4599
4600         // In deblocking, the quantizer is 0
4601         s->current_picture.qscale_table[mb_xy]= 0;
4602         h->chroma_qp[0] = get_chroma_qp(h, 0, 0);
4603         h->chroma_qp[1] = get_chroma_qp(h, 1, 0);
4604         // All coeffs are present
4605         memset(h->non_zero_count[mb_xy], 16, 16);
4606
4607         s->current_picture.mb_type[mb_xy]= mb_type;
4608         return 0;
4609     }
4610
4611     if(MB_MBAFF){ // ref_count is doubled while this macroblock is parsed and halved again before the final return 0
4612         h->ref_count[0] <<= 1;
4613         h->ref_count[1] <<= 1; // NOTE(review): the return -1 paths below leave both counts doubled -- confirm callers cope
4614     }
4615
4616     fill_caches(h, mb_type, 0);
4617
4618     //mb_pred
4619     if(IS_INTRA(mb_type)){
4620             int pred_mode;
4621 //            init_top_left_availability(h);
4622             if(IS_INTRA4x4(mb_type)){
4623                 int i;
4624                 int di = 1;
4625                 if(dct8x8_allowed && get_bits1(&s->gb)){ // one flag bit selects the 8x8 transform
4626                     mb_type |= MB_TYPE_8x8DCT;
4627                     di = 4; // one prediction mode per group of four 4x4 blocks instead of one per block
4628                 }
4629
4630 //                fill_intra4x4_pred_table(h);
4631                 for(i=0; i<16; i+=di){
4632                     int mode= pred_intra_mode(h, i); // predicted mode, kept unless the next bit is 0
4633
4634                     if(!get_bits1(&s->gb)){
4635                         const int rem_mode= get_bits(&s->gb, 3);
4636                         mode = rem_mode + (rem_mode >= mode); // the 3 bit code skips over the predicted mode
4637                     }
4638
4639                     if(di==4)
4640                         fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4641                     else
4642                         h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4643                 }
4644                 write_back_intra_pred_mode(h);
4645                 if( check_intra4x4_pred_mode(h) < 0)
4646                     return -1;
4647             }else{
4648                 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4649                 if(h->intra16x16_pred_mode < 0)
4650                     return -1;
4651             }
4652
4653             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb)); // chroma prediction mode, read for every intra type reaching this point
4654             if(pred_mode < 0)
4655                 return -1;
4656             h->chroma_pred_mode= pred_mode;
4657     }else if(partition_count==4){ // four sub macroblocks, each with its own sub_mb_type
4658         int i, j, sub_partition_count[4], list, ref[2][4];
4659
4660         if(h->slice_type == FF_B_TYPE){
4661             for(i=0; i<4; i++){
4662                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4663                 if(h->sub_mb_type[i] >=13){ // b_sub_mb_type_info is only indexed with 0..12
4664                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4665                     return -1;
4666                 }
4667                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4668                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type; // replaces the raw code, so it has to come after the lookup above
4669             }
4670             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4671                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4672                 pred_direct_motion(h, &mb_type);
4673                 h->ref_cache[0][scan8[4]] =
4674                 h->ref_cache[1][scan8[4]] =
4675                 h->ref_cache[0][scan8[12]] =
4676                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4677             }
4678         }else{
4679             assert(h->slice_type == FF_P_TYPE || h->slice_type == FF_SP_TYPE); //FIXME SP correct ?
4680             for(i=0; i<4; i++){
4681                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4682                 if(h->sub_mb_type[i] >=4){ // p_sub_mb_type_info is only indexed with 0..3
4683                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4684                     return -1;
4685                 }
4686                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4687                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4688             }
4689         }
4690
4691         for(list=0; list<h->list_count; list++){ // first pass: all reference indices of all lists
4692             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4693             for(i=0; i<4; i++){
4694                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4695                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4696                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4697                     if(tmp>=ref_count){
4698                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4699                         return -1;
4700                     }
4701                     ref[list][i]= tmp;
4702                 }else{
4703                  //FIXME
4704                     ref[list][i] = -1;
4705                 }
4706             }
4707         }
4708
4709         if(dct8x8_allowed)
4710             dct8x8_allowed = get_dct8x8_allowed(h);
4711
4712         for(list=0; list<h->list_count; list++){ // second pass: motion vector differences, added to the predicted vectors
4713             for(i=0; i<4; i++){
4714                 if(IS_DIRECT(h->sub_mb_type[i])) {
4715                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4716                     continue;
4717                 }
4718                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4719                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4720
4721                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4722                     const int sub_mb_type= h->sub_mb_type[i];
4723                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4724                     for(j=0; j<sub_partition_count[i]; j++){
4725                         int mx, my;
4726                         const int index= 4*i + block_width*j;
4727                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4728                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4729                         mx += get_se_golomb(&s->gb);
4730                         my += get_se_golomb(&s->gb);
4731                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4732
4733                         if(IS_SUB_8X8(sub_mb_type)){ // the vector is replicated over the cache entries covered by the partition
4734                             mv_cache[ 1 ][0]=
4735                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4736                             mv_cache[ 1 ][1]=
4737                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4738                         }else if(IS_SUB_8X4(sub_mb_type)){
4739                             mv_cache[ 1 ][0]= mx;
4740                             mv_cache[ 1 ][1]= my;
4741                         }else if(IS_SUB_4X8(sub_mb_type)){
4742                             mv_cache[ 8 ][0]= mx;
4743                             mv_cache[ 8 ][1]= my;
4744                         }
4745                         mv_cache[ 0 ][0]= mx;
4746                         mv_cache[ 0 ][1]= my;
4747                     }
4748                 }else{
4749                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0]; // list not used by this sub macroblock: its four vectors are zeroed
4750                     p[0] = p[1]=
4751                     p[8] = p[9]= 0;
4752                 }
4753             }
4754         }
4755     }else if(IS_DIRECT(mb_type)){
4756         pred_direct_motion(h, &mb_type);
4757         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4758     }else{
4759         int list, mx, my, i;
4760          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4761         if(IS_16X16(mb_type)){ // in each partition shape below all reference indices are read before any motion vector
4762             for(list=0; list<h->list_count; list++){
4763                     unsigned int val;
4764                     if(IS_DIR(mb_type, 0, list)){
4765                         val= get_te0_golomb(&s->gb, h->ref_count[list]);
4766                         if(val >= h->ref_count[list]){
4767                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4768                             return -1;
4769                         }
4770                     }else
4771                         val= LIST_NOT_USED&0xFF;
4772                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4773             }
4774             for(list=0; list<h->list_count; list++){
4775                 unsigned int val;
4776                 if(IS_DIR(mb_type, 0, list)){
4777                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4778                     mx += get_se_golomb(&s->gb);
4779                     my += get_se_golomb(&s->gb);
4780                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4781
4782                     val= pack16to32(mx,my);
4783                 }else
4784                     val=0;
4785                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4786             }
4787         }
4788         else if(IS_16X8(mb_type)){
4789             for(list=0; list<h->list_count; list++){
4790                     for(i=0; i<2; i++){
4791                         unsigned int val;
4792                         if(IS_DIR(mb_type, i, list)){
4793                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4794                             if(val >= h->ref_count[list]){
4795                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4796                                 return -1;
4797                             }
4798                         }else
4799                             val= LIST_NOT_USED&0xFF;
4800                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1); // 4 wide, 2 high; the second partition starts 16 cache entries later
4801                     }
4802             }
4803             for(list=0; list<h->list_count; list++){
4804                 for(i=0; i<2; i++){
4805                     unsigned int val;
4806                     if(IS_DIR(mb_type, i, list)){
4807                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4808                         mx += get_se_golomb(&s->gb);
4809                         my += get_se_golomb(&s->gb);
4810                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4811
4812                         val= pack16to32(mx,my);
4813                     }else
4814                         val=0;
4815                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4816                 }
4817             }
4818         }else{
4819             assert(IS_8X16(mb_type));
4820             for(list=0; list<h->list_count; list++){
4821                     for(i=0; i<2; i++){
4822                         unsigned int val;
4823                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4824                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4825                             if(val >= h->ref_count[list]){
4826                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4827                                 return -1;
4828                             }
4829                         }else
4830                             val= LIST_NOT_USED&0xFF;
4831                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1); // 2 wide, 4 high; the second partition starts 2 cache entries later
4832                     }
4833             }
4834             for(list=0; list<h->list_count; list++){
4835                 for(i=0; i<2; i++){
4836                     unsigned int val;
4837                     if(IS_DIR(mb_type, i, list)){
4838                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4839                         mx += get_se_golomb(&s->gb);
4840                         my += get_se_golomb(&s->gb);
4841                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4842
4843                         val= pack16to32(mx,my);
4844                     }else
4845                         val=0;
4846                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4847                 }
4848             }
4849         }
4850     }
4851
4852     if(IS_INTER(mb_type))
4853         write_back_motion(h, mb_type);
4854
4855     if(!IS_INTRA16x16(mb_type)){ // intra 16x16 already took its cbp from i_mb_type_info
4856         cbp= get_ue_golomb(&s->gb);
4857         if(cbp > 47){ // the two mapping tables below are only indexed with 0..47
4858             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4859             return -1;
4860         }
4861
4862         if(IS_INTRA4x4(mb_type))
4863             cbp= golomb_to_intra4x4_cbp[cbp];
4864         else
4865             cbp= golomb_to_inter_cbp[cbp];
4866     }
4867     h->cbp = cbp;
4868
4869     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){ // non intra macroblocks with coded luma signal the 8x8 transform with one bit
4870         if(get_bits1(&s->gb))
4871             mb_type |= MB_TYPE_8x8DCT;
4872     }
4873     s->current_picture.mb_type[mb_xy]= mb_type;
4874
4875     if(cbp || IS_INTRA16x16(mb_type)){ // dquant and residual are only present in this case
4876         int i8x8, i4x4, chroma_idx;
4877         int dquant;
4878         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr; // residual blocks are read through the intra or the inter bit reader
4879         const uint8_t *scan, *scan8x8, *dc_scan;
4880
4881 //        fill_non_zero_count_cache(h);
4882
4883         if(IS_INTERLACED(mb_type)){ // field or zigzag scan tables; the *_q0 variants are picked when qscale is 0
4884             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4885             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4886             dc_scan= luma_dc_field_scan;
4887         }else{
4888             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4889             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4890             dc_scan= luma_dc_zigzag_scan;
4891         }
4892
4893         dquant= get_se_golomb(&s->gb);
4894
4895         if( dquant > 25 || dquant < -26 ){
4896             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4897             return -1;
4898         }
4899
4900         s->qscale += dquant;
4901         if(((unsigned)s->qscale) > 51){ // wrap the updated qscale around by 52 when it left 0..51
4902             if(s->qscale<0) s->qscale+= 52;
4903             else            s->qscale-= 52;
4904         }
4905
4906         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4907         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4908         if(IS_INTRA16x16(mb_type)){
4909             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){ // luma DC block, always present for intra 16x16
4910                 return -1; //FIXME continue if partitioned and other return -1 too
4911             }
4912
4913             assert((cbp&15) == 0 || (cbp&15) == 15);
4914
4915             if(cbp&15){
4916                 for(i8x8=0; i8x8<4; i8x8++){
4917                     for(i4x4=0; i4x4<4; i4x4++){
4918                         const int index= i4x4 + 4*i8x8;
4919                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){ // 15 coefficients, the scan table is entered after its first position
4920                             return -1;
4921                         }
4922                     }
4923                 }
4924             }else{
4925                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4926             }
4927         }else{
4928             for(i8x8=0; i8x8<4; i8x8++){
4929                 if(cbp & (1<<i8x8)){ // bit i8x8 of cbp: this 8x8 luma block has coded coefficients
4930                     if(IS_8x8DCT(mb_type)){
4931                         DCTELEM *buf = &h->mb[64*i8x8];
4932                         uint8_t *nnz;
4933                         for(i4x4=0; i4x4<4; i4x4++){
4934                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4, // four calls write into the same 64 coefficient buffer, each with its own 16 entry part of scan8x8
4935                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4936                                 return -1;
4937                         }
4938                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4939                         nnz[0] += nnz[1] + nnz[8] + nnz[9]; // the sum of the four counts ends up in the first entry
4940                     }else{
4941                         for(i4x4=0; i4x4<4; i4x4++){
4942                             const int index= i4x4 + 4*i8x8;
4943
4944                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4945                                 return -1;
4946                             }
4947                         }
4948                     }
4949                 }else{
4950                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4951                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4952                 }
4953             }
4954         }
4955
4956         if(cbp&0x30){ // chroma DC is read when either of the two chroma bits of cbp is set
4957             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4958                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4959                     return -1;
4960                 }
4961         }
4962
4963         if(cbp&0x20){ // chroma AC is read only with the upper chroma bit
4964             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4965                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4966                 for(i4x4=0; i4x4<4; i4x4++){
4967                     const int index= 16 + 4*chroma_idx + i4x4;
4968                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4969                         return -1;
4970                     }
4971                 }
4972             }
4973         }else{
4974             uint8_t * const nnz= &h->non_zero_count_cache[0];
4975             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4976             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4977         }
4978     }else{ // nothing coded: all luma and chroma non zero counts in the cache are cleared
4979         uint8_t * const nnz= &h->non_zero_count_cache[0];
4980         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4981         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4982         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4983     }
4984     s->current_picture.qscale_table[mb_xy]= s->qscale;
4985     write_back_non_zero_count(h);
4986
4987     if(MB_MBAFF){ // undo the doubling done before fill_caches()
4988         h->ref_count[0] >>= 1;
4989         h->ref_count[1] >>= 1;
4990     }
4991
4992     return 0;
4993 }
4994
4995 static int decode_cabac_field_decoding_flag(H264Context *h) {
4996     MpegEncContext * const s = &h->s;
4997     const int mb_x = s->mb_x;
4998     const int mb_y = s->mb_y & ~1;
4999     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
5000     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
5001
5002     unsigned int ctx = 0;
5003
5004     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
5005         ctx += 1;
5006     }
5007     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
5008         ctx += 1;
5009     }
5010
5011     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
5012 }
5013
5014 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
5015     uint8_t *state= &h->cabac_state[ctx_base];
5016     int mb_type;
5017
5018     if(intra_slice){
5019         MpegEncContext * const s = &h->s;
5020         const int mba_xy = h->left_mb_xy[0];
5021         const int mbb_xy = h->top_mb_xy;
5022         int ctx=0;
5023         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
5024             ctx++;
5025         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
5026             ctx++;
5027         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
5028             return 0;   /* I4x4 */
5029         state += 2;
5030     }else{
5031         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
5032             return 0;   /* I4x4 */
5033     }
5034
5035     if( get_cabac_terminate( &h->cabac ) )
5036         return 25;  /* PCM */
5037
5038     mb_type = 1; /* I16x16 */
5039     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
5040     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
5041         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
5042     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
5043     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
5044     return mb_type;
5045 }
5046
5047 static int decode_cabac_mb_type( H264Context *h ) {
5048     MpegEncContext * const s = &h->s;
5049
5050     if( h->slice_type == FF_I_TYPE ) {
5051         return decode_cabac_intra_mb_type(h, 3, 1);
5052     } else if( h->slice_type == FF_P_TYPE ) {
5053         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5054             /* P-type */
5055             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5056                 /* P_L0_D16x16, P_8x8 */
5057                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5058             } else {
5059                 /* P_L0_D8x16, P_L0_D16x8 */
5060                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5061             }
5062         } else {
5063             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
5064         }
5065     } else if( h->slice_type == FF_B_TYPE ) {
5066         const int mba_xy = h->left_mb_xy[0];
5067         const int mbb_xy = h->top_mb_xy;
5068         int ctx = 0;
5069         int bits;
5070
5071         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
5072             ctx++;
5073         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
5074             ctx++;
5075
5076         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
5077             return 0; /* B_Direct_16x16 */
5078
5079         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
5080             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
5081         }
5082
5083         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
5084         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
5085         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
5086         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5087         if( bits < 8 )
5088             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
5089         else if( bits == 13 ) {
5090             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
5091         } else if( bits == 14 )
5092             return 11; /* B_L1_L0_8x16 */
5093         else if( bits == 15 )
5094             return 22; /* B_8x8 */
5095
5096         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5097         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5098     } else {
5099         /* TODO SI/SP frames? */
5100         return -1;
5101     }
5102 }
5103
5104 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
5105     MpegEncContext * const s = &h->s;
5106     int mba_xy, mbb_xy;
5107     int ctx = 0;
5108
5109     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
5110         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
5111         mba_xy = mb_xy - 1;
5112         if( (mb_y&1)
5113             && h->slice_table[mba_xy] == h->slice_num
5114             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
5115             mba_xy += s->mb_stride;
5116         if( MB_FIELD ){
5117             mbb_xy = mb_xy - s->mb_stride;
5118             if( !(mb_y&1)
5119                 && h->slice_table[mbb_xy] == h->slice_num
5120                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
5121                 mbb_xy -= s->mb_stride;
5122         }else
5123             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
5124     }else{
5125         int mb_xy = h->mb_xy;
5126         mba_xy = mb_xy - 1;
5127         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
5128     }
5129
5130     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5131         ctx++;
5132     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5133         ctx++;
5134
5135     if( h->slice_type == FF_B_TYPE )
5136         ctx += 13;
5137     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
5138 }
5139
5140 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5141     int mode = 0;
5142
5143     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5144         return pred_mode;
5145
5146     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5147     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5148     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5149
5150     if( mode >= pred_mode )
5151         return mode + 1;
5152     else
5153         return mode;
5154 }
5155
5156 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5157     const int mba_xy = h->left_mb_xy[0];
5158     const int mbb_xy = h->top_mb_xy;
5159
5160     int ctx = 0;
5161
5162     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5163     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5164         ctx++;
5165
5166     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5167         ctx++;
5168
5169     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5170         return 0;
5171
5172     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5173         return 1;
5174     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5175         return 2;
5176     else
5177         return 3;
5178 }
5179
5180 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5181     int cbp_b, cbp_a, ctx, cbp = 0;
5182
5183     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5184     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
5185
5186     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5187     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5188     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
5189     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5190     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
5191     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5192     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
5193     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5194     return cbp;
5195 }
5196 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5197     int ctx;
5198     int cbp_a, cbp_b;
5199
5200     cbp_a = (h->left_cbp>>4)&0x03;
5201     cbp_b = (h-> top_cbp>>4)&0x03;
5202
5203     ctx = 0;
5204     if( cbp_a > 0 ) ctx++;
5205     if( cbp_b > 0 ) ctx += 2;
5206     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5207         return 0;
5208
5209     ctx = 4;
5210     if( cbp_a == 2 ) ctx++;
5211     if( cbp_b == 2 ) ctx += 2;
5212     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5213 }
5214 static int decode_cabac_mb_dqp( H264Context *h) {
5215     int   ctx = 0;
5216     int   val = 0;
5217
5218     if( h->last_qscale_diff != 0 )
5219         ctx++;
5220
5221     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5222         if( ctx < 2 )
5223             ctx = 2;
5224         else
5225             ctx = 3;
5226         val++;
5227         if(val > 102) //prevent infinite loop
5228             return INT_MIN;
5229     }
5230
5231     if( val&0x01 )
5232         return (val + 1)/2;
5233     else
5234         return -(val + 1)/2;
5235 }
5236 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5237     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5238         return 0;   /* 8x8 */
5239     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5240         return 1;   /* 8x4 */
5241     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5242         return 2;   /* 4x8 */
5243     return 3;       /* 4x4 */
5244 }
5245 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5246     int type;
5247     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5248         return 0;   /* B_Direct_8x8 */
5249     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5250         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5251     type = 3;
5252     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5253         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5254             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5255         type += 4;
5256     }
5257     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5258     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5259     return type;
5260 }
5261
5262 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5263     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5264 }
5265
5266 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5267     int refa = h->ref_cache[list][scan8[n] - 1];
5268     int refb = h->ref_cache[list][scan8[n] - 8];
5269     int ref  = 0;
5270     int ctx  = 0;
5271
5272     if( h->slice_type == FF_B_TYPE) {
5273         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5274             ctx++;
5275         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5276             ctx += 2;
5277     } else {
5278         if( refa > 0 )
5279             ctx++;
5280         if( refb > 0 )
5281             ctx += 2;
5282     }
5283
5284     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5285         ref++;
5286         if( ctx < 4 )
5287             ctx = 4;
5288         else
5289             ctx = 5;
5290         if(ref >= 32 /*h->ref_list[list]*/){
5291             av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
5292             return 0; //FIXME we should return -1 and check the return everywhere
5293         }
5294     }
5295     return ref;
5296 }
5297
5298 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5299     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5300                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5301     int ctxbase = (l == 0) ? 40 : 47;
5302     int ctx, mvd;
5303
5304     if( amvd < 3 )
5305         ctx = 0;
5306     else if( amvd > 32 )
5307         ctx = 2;
5308     else
5309         ctx = 1;
5310
5311     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5312         return 0;
5313
5314     mvd= 1;
5315     ctx= 3;
5316     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5317         mvd++;
5318         if( ctx < 6 )
5319             ctx++;
5320     }
5321
5322     if( mvd >= 9 ) {
5323         int k = 3;
5324         while( get_cabac_bypass( &h->cabac ) ) {
5325             mvd += 1 << k;
5326             k++;
5327             if(k>24){
5328                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5329                 return INT_MIN;
5330             }
5331         }
5332         while( k-- ) {
5333             if( get_cabac_bypass( &h->cabac ) )
5334                 mvd += 1 << k;
5335         }
5336     }
5337     return get_cabac_bypass_sign( &h->cabac, -mvd );
5338 }
5339
5340 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5341     int nza, nzb;
5342     int ctx = 0;
5343
5344     if( is_dc ) {
5345     if( cat == 0 ) {
5346         nza = h->left_cbp&0x100;
5347         nzb = h-> top_cbp&0x100;
5348     } else {
5349         nza = (h->left_cbp>>(6+idx))&0x01;
5350         nzb = (h-> top_cbp>>(6+idx))&0x01;
5351     }
5352     } else {
5353     if( cat == 1 || cat == 2 ) {
5354         nza = h->non_zero_count_cache[scan8[idx] - 1];
5355         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5356     } else {
5357         assert(cat == 4);
5358         nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5359         nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5360     }
5361     }
5362
5363     if( nza > 0 )
5364         ctx++;
5365
5366     if( nzb > 0 )
5367         ctx += 2;
5368
5369     return ctx + 4 * cat;
5370 }
5371
5372 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5373     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5374     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5375     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5376     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5377 };
5378
/**
 * Decodes the residual coefficients of one block from the CABAC stream.
 *
 * The work is done in three steps: the coded block flag (not read for
 * non-DC cat 5 blocks), the significance map that collects the scan
 * positions of the non-zero coefficients in index[], and finally the
 * levels, which are decoded from the last collected position back to the
 * first and written to block[].
 *
 * Side effects besides block[]: DC blocks set a bit in
 * h->cbp_table[h->mb_xy], the other blocks store their coefficient count in
 * h->non_zero_count_cache.
 *
 * @param block     destination for the coefficients
 * @param cat       block category, see the table in the body
 * @param n         block index, its meaning depends on cat
 * @param scantable maps a scan position to an index into block[]
 * @param qmul      dequantization factors, indexed like block[]; only read
 *                  when is_dc is 0
 * @param max_coeff number of coefficients of the block
 * @param is_dc     non-zero selects the DC paths: cbp based bookkeeping and
 *                  levels stored without dequantization
 */
static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
    /* The three tables below hold offsets into h->cabac_state; the
     * two-dimensional ones are indexed by [MB_FIELD][cat]. */
    static const int significant_coeff_flag_offset[2][6] = {
      { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
      { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
    };
    static const int last_coeff_flag_offset[2][6] = {
      { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
      { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
    };
    static const int coeff_abs_level_m1_offset[6] = {
        227+0, 227+10, 227+20, 227+30, 227+39, 426
    };
    /* per scan position context offset of the significance flag of 8x8
     * blocks, indexed by [MB_FIELD][position] */
    static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
      { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
        4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
        7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
       12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
      { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
        6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
        9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
        9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
    };
    /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
     * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
     * map node ctx => cabac ctx for level=1 */
    static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
    /* map node ctx => cabac ctx for level>1 */
    static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
    static const uint8_t coeff_abs_level_transition[2][8] = {
    /* update node ctx after decoding a level=1 */
        { 1, 2, 3, 3, 4, 5, 6, 7 },
    /* update node ctx after decoding a level>1 */
        { 4, 4, 4, 4, 5, 6, 7, 7 }
    };

    /* scan positions of the significant coefficients, in decoding order */
    int index[64];

    int av_unused last;
    int coeff_count = 0;
    int node_ctx = 0;

    uint8_t *significant_coeff_ctx_base;
    uint8_t *last_coeff_ctx_base;
    uint8_t *abs_level_m1_ctx_base;

    /* Unless ARCH_X86 is defined, the decoder state is copied into the local
     * cc, used through CC and written back before every return.
     * NOTE(review): only range, low and bytestream are copied; this assumes
     * the get_cabac*() functions need no other field of the context -
     * confirm.
     * The macros defined in this function are not #undef'd afterwards. */
#ifndef ARCH_X86
#define CABAC_ON_STACK
#endif
#ifdef CABAC_ON_STACK
#define CC &cc
    CABACContext cc;
    cc.range     = h->cabac.range;
    cc.low       = h->cabac.low;
    cc.bytestream= h->cabac.bytestream;
#else
#define CC &h->cabac
#endif


    /* cat: 0-> DC 16x16  n = 0
     *      1-> AC 16x16  n = luma4x4idx
     *      2-> Luma4x4   n = luma4x4idx
     *      3-> DC Chroma n = iCbCr
     *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
     *      5-> Luma8x8   n = 4 * luma8x8idx
     */

    /* read coded block flag */
    if( is_dc || cat != 5 ) {
        if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
            /* empty block: record a zero coefficient count (non-DC only),
             * restore the decoder state and leave */
            if( !is_dc ) {
            if( cat == 1 || cat == 2 )
                h->non_zero_count_cache[scan8[n]] = 0;
            else
                h->non_zero_count_cache[scan8[16+n]] = 0;
            }

#ifdef CABAC_ON_STACK
            h->cabac.range     = cc.range     ;
            h->cabac.low       = cc.low       ;
            h->cabac.bytestream= cc.bytestream;
#endif
            return;
        }
    }

    significant_coeff_ctx_base = h->cabac_state
        + significant_coeff_flag_offset[MB_FIELD][cat];
    last_coeff_ctx_base = h->cabac_state
        + last_coeff_flag_offset[MB_FIELD][cat];
    abs_level_m1_ctx_base = h->cabac_state
        + coeff_abs_level_m1_offset[cat];

    /* Significance map. Note that the "} else {" separating the 8x8 case
     * from the other categories sits inside each preprocessor branch below.
     *
     * DECODE_SIGNIFICANCE reads a significance flag for each of the first
     * "coefs" positions; a significant position is appended to index[] and
     * followed by a "last" flag which, when set, ends the scan with
     * last = max_coeff. If the loop instead runs to its end and that end is
     * position max_coeff-1, that final position is appended without reading
     * a flag for it. */
    if( !is_dc && cat == 5 ) {
#define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
        for(last= 0; last < coefs; last++) { \
            uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
            if( get_cabac( CC, sig_ctx )) { \
                uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
                index[coeff_count++] = last; \
                if( get_cabac( CC, last_ctx ) ) { \
                    last= max_coeff; \
                    break; \
                } \
            } \
        }\
        if( last == max_coeff -1 ) {\
            index[coeff_count++] = last;\
        }
        const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
#if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
        coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
    } else {
        coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
#else
        DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
    } else {
        DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
#endif
    }
    assert(coeff_count > 0);

    /* record that the block has coefficients: a cbp_table bit for DC blocks,
     * the coefficient count in the non zero count cache otherwise */
    if( is_dc ) {
    if( cat == 0 )
        h->cbp_table[h->mb_xy] |= 0x100;
    else
        h->cbp_table[h->mb_xy] |= 0x40 << n;
    } else {
    if( cat == 1 || cat == 2 )
        h->non_zero_count_cache[scan8[n]] = coeff_count;
    else if( cat == 4 )
        h->non_zero_count_cache[scan8[16+n]] = coeff_count;
    else {
        assert( cat == 5 );
        fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
    }
    }

    /* levels, from the last significant position back to the first */
    for( coeff_count--; coeff_count >= 0; coeff_count-- ) {
        uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;

        /* position inside block[] of the coefficient being decoded */
        int j= scantable[index[coeff_count]];

        if( get_cabac( CC, ctx ) == 0 ) {
            /* magnitude 1: only the bypass coded sign follows; non-DC
             * values are scaled by qmul[j] and rounded with (x + 32) >> 6 */
            node_ctx = coeff_abs_level_transition[0][node_ctx];
            if( is_dc ) {
                block[j] = get_cabac_bypass_sign( CC, -1);
            }else{
                block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
            }
        } else {
            int coeff_abs = 2;
            ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
            node_ctx = coeff_abs_level_transition[1][node_ctx];

            /* context coded unary part of the magnitude, capped at 15 */
            while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
                coeff_abs++;
            }

            if( coeff_abs >= 15 ) {
                /* bypass coded escape: a unary prefix of length j (this j
                 * shadows the block position above) followed by j suffix
                 * bits.
                 * NOTE(review): the prefix loop has no upper bound, unlike
                 * the escape loop in decode_cabac_mb_mvd(); a damaged stream
                 * could presumably make coeff_abs overflow - confirm. */
                int j = 0;
                while( get_cabac_bypass( CC ) ) {
                    j++;
                }

                coeff_abs=1;
                while( j-- ) {
                    coeff_abs += coeff_abs + get_cabac_bypass( CC );
                }
                coeff_abs+= 14;
            }

            /* the sign is a bypass bin, 1 meaning negative */
            if( is_dc ) {
                if( get_cabac_bypass( CC ) ) block[j] = -coeff_abs;
                else                                block[j] =  coeff_abs;
            }else{
                if( get_cabac_bypass( CC ) ) block[j] = (-coeff_abs * qmul[j] + 32) >> 6;
                else                                block[j] = ( coeff_abs * qmul[j] + 32) >> 6;
            }
        }
    }
    /* hand the locally advanced decoder state back to the context */
#ifdef CABAC_ON_STACK
            h->cabac.range     = cc.range     ;
            h->cabac.low       = cc.low       ;
            h->cabac.bytestream= cc.bytestream;
#endif

}
5567
5568 #ifndef CONFIG_SMALL
5569 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5570     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5571 }
5572
5573 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5574     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5575 }
5576 #endif
5577
5578 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5579 #ifdef CONFIG_SMALL
5580     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5581 #else
5582     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5583     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5584 #endif
5585 }
5586
5587 static inline void compute_mb_neighbors(H264Context *h)
5588 {
5589     MpegEncContext * const s = &h->s;
5590     const int mb_xy  = h->mb_xy;
5591     h->top_mb_xy     = mb_xy - s->mb_stride;
5592     h->left_mb_xy[0] = mb_xy - 1;
5593     if(FRAME_MBAFF){
5594         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5595         const int top_pair_xy      = pair_xy     - s->mb_stride;
5596         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5597         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5598         const int curr_mb_frame_flag = !MB_FIELD;
5599         const int bottom = (s->mb_y & 1);
5600         if (bottom
5601                 ? !curr_mb_frame_flag // bottom macroblock
5602                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5603                 ) {
5604             h->top_mb_xy -= s->mb_stride;
5605         }
5606         if (left_mb_frame_flag != curr_mb_frame_flag) {
5607             h->left_mb_xy[0] = pair_xy - 1;
5608         }
5609     } else if (FIELD_PICTURE) {
5610         h->top_mb_xy -= s->mb_stride;
5611     }
5612     return;
5613 }
5614
5615 /**
5616  * decodes a macroblock
5617  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5618  */
5619 static int decode_mb_cabac(H264Context *h) {
5620     MpegEncContext * const s = &h->s;
5621     int mb_xy;
5622     int mb_type, partition_count, cbp = 0;
5623     int dct8x8_allowed= h->pps.transform_8x8_mode;
5624
5625     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5626
5627     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5628
5629     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5630     if( h->slice_type != FF_I_TYPE && h->slice_type != FF_SI_TYPE ) {
5631         int skip;
5632         /* a skipped mb needs the aff flag from the following mb */
5633         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5634             predict_field_decoding_flag(h);
5635         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5636             skip = h->next_mb_skipped;
5637         else
5638             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5639         /* read skip flags */
5640         if( skip ) {
5641             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5642                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5643                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5644                 if(h->next_mb_skipped)
5645                     predict_field_decoding_flag(h);
5646                 else
5647                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5648             }
5649
5650             decode_mb_skip(h);
5651
5652             h->cbp_table[mb_xy] = 0;
5653             h->chroma_pred_mode_table[mb_xy] = 0;
5654             h->last_qscale_diff = 0;
5655
5656             return 0;
5657
5658         }
5659     }
5660     if(FRAME_MBAFF){
5661         if( (s->mb_y&1) == 0 )
5662             h->mb_mbaff =
5663             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5664     }else
5665         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5666
5667     h->prev_mb_skipped = 0;
5668
5669     compute_mb_neighbors(h);
5670     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5671         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5672         return -1;
5673     }
5674
5675     if( h->slice_type == FF_B_TYPE ) {
5676         if( mb_type < 23 ){
5677             partition_count= b_mb_type_info[mb_type].partition_count;
5678             mb_type=         b_mb_type_info[mb_type].type;
5679         }else{
5680             mb_type -= 23;
5681             goto decode_intra_mb;
5682         }
5683     } else if( h->slice_type == FF_P_TYPE ) {
5684         if( mb_type < 5) {
5685             partition_count= p_mb_type_info[mb_type].partition_count;
5686             mb_type=         p_mb_type_info[mb_type].type;
5687         } else {
5688             mb_type -= 5;
5689             goto decode_intra_mb;
5690         }
5691     } else {
5692        assert(h->slice_type == FF_I_TYPE);
5693 decode_intra_mb:
5694         partition_count = 0;
5695         cbp= i_mb_type_info[mb_type].cbp;
5696         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5697         mb_type= i_mb_type_info[mb_type].type;
5698     }
5699     if(MB_FIELD)
5700         mb_type |= MB_TYPE_INTERLACED;
5701
5702     h->slice_table[ mb_xy ]= h->slice_num;
5703
5704     if(IS_INTRA_PCM(mb_type)) {
5705         const uint8_t *ptr;
5706         unsigned int x, y;
5707
5708         // We assume these blocks are very rare so we do not optimize it.
5709         // FIXME The two following lines get the bitstream position in the cabac
5710         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5711         ptr= h->cabac.bytestream;
5712         if(h->cabac.low&0x1) ptr--;
5713         if(CABAC_BITS==16){
5714             if(h->cabac.low&0x1FF) ptr--;
5715         }
5716
5717         // The pixels are stored in the same order as levels in h->mb array.
5718         for(y=0; y<16; y++){
5719             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
5720             for(x=0; x<16; x++){
5721                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", *ptr);
5722                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
5723             }
5724         }
5725         for(y=0; y<8; y++){
5726             const int index= 256 + 4*(y&3) + 32*(y>>2);
5727             for(x=0; x<8; x++){
5728                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", *ptr);
5729                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5730             }
5731         }
5732         for(y=0; y<8; y++){
5733             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
5734             for(x=0; x<8; x++){
5735                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", *ptr);
5736                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5737             }
5738         }
5739
5740         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5741
5742         // All blocks are present
5743         h->cbp_table[mb_xy] = 0x1ef;
5744         h->chroma_pred_mode_table[mb_xy] = 0;
5745         // In deblocking, the quantizer is 0
5746         s->current_picture.qscale_table[mb_xy]= 0;
5747         h->chroma_qp[0] = get_chroma_qp(h, 0, 0);
5748         h->chroma_qp[1] = get_chroma_qp(h, 1, 0);
5749         // All coeffs are present
5750         memset(h->non_zero_count[mb_xy], 16, 16);
5751         s->current_picture.mb_type[mb_xy]= mb_type;
5752         return 0;
5753     }
5754
5755     if(MB_MBAFF){
5756         h->ref_count[0] <<= 1;
5757         h->ref_count[1] <<= 1;
5758     }
5759
5760     fill_caches(h, mb_type, 0);
5761
5762     if( IS_INTRA( mb_type ) ) {
5763         int i, pred_mode;
5764         if( IS_INTRA4x4( mb_type ) ) {
5765             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5766                 mb_type |= MB_TYPE_8x8DCT;
5767                 for( i = 0; i < 16; i+=4 ) {
5768                     int pred = pred_intra_mode( h, i );
5769                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5770                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5771                 }
5772             } else {
5773                 for( i = 0; i < 16; i++ ) {
5774                     int pred = pred_intra_mode( h, i );
5775                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5776
5777                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5778                 }
5779             }
5780             write_back_intra_pred_mode(h);
5781             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5782         } else {
5783             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5784             if( h->intra16x16_pred_mode < 0 ) return -1;
5785         }
5786         h->chroma_pred_mode_table[mb_xy] =
5787         pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5788
5789         pred_mode= check_intra_pred_mode( h, pred_mode );
5790         if( pred_mode < 0 ) return -1;
5791         h->chroma_pred_mode= pred_mode;
5792     } else if( partition_count == 4 ) {
5793         int i, j, sub_partition_count[4], list, ref[2][4];
5794
5795         if( h->slice_type == FF_B_TYPE ) {
5796             for( i = 0; i < 4; i++ ) {
5797                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5798                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5799                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5800             }
5801             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5802                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5803                 pred_direct_motion(h, &mb_type);
5804                 h->ref_cache[0][scan8[4]] =
5805                 h->ref_cache[1][scan8[4]] =
5806                 h->ref_cache[0][scan8[12]] =
5807                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5808                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5809                     for( i = 0; i < 4; i++ )
5810                         if( IS_DIRECT(h->sub_mb_type[i]) )
5811                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5812                 }
5813             }
5814         } else {
5815             for( i = 0; i < 4; i++ ) {
5816                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5817                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5818                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5819             }
5820         }
5821
5822         for( list = 0; list < h->list_count; list++ ) {
5823                 for( i = 0; i < 4; i++ ) {
5824                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5825                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5826                         if( h->ref_count[list] > 1 )
5827                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5828                         else
5829                             ref[list][i] = 0;
5830                     } else {
5831                         ref[list][i] = -1;
5832                     }
5833                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5834                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5835                 }
5836         }
5837
5838         if(dct8x8_allowed)
5839             dct8x8_allowed = get_dct8x8_allowed(h);
5840
5841         for(list=0; list<h->list_count; list++){
5842             for(i=0; i<4; i++){
5843                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5844                 if(IS_DIRECT(h->sub_mb_type[i])){
5845                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5846                     continue;
5847                 }
5848
5849                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5850                     const int sub_mb_type= h->sub_mb_type[i];
5851                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5852                     for(j=0; j<sub_partition_count[i]; j++){
5853                         int mpx, mpy;
5854                         int mx, my;
5855                         const int index= 4*i + block_width*j;
5856                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5857                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5858                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5859
5860                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5861                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5862                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5863
5864                         if(IS_SUB_8X8(sub_mb_type)){
5865                             mv_cache[ 1 ][0]=
5866                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5867                             mv_cache[ 1 ][1]=
5868                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5869
5870                             mvd_cache[ 1 ][0]=
5871                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5872                             mvd_cache[ 1 ][1]=
5873                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5874                         }else if(IS_SUB_8X4(sub_mb_type)){
5875                             mv_cache[ 1 ][0]= mx;
5876                             mv_cache[ 1 ][1]= my;
5877
5878                             mvd_cache[ 1 ][0]= mx - mpx;
5879                             mvd_cache[ 1 ][1]= my - mpy;
5880                         }else if(IS_SUB_4X8(sub_mb_type)){
5881                             mv_cache[ 8 ][0]= mx;
5882                             mv_cache[ 8 ][1]= my;
5883
5884                             mvd_cache[ 8 ][0]= mx - mpx;
5885                             mvd_cache[ 8 ][1]= my - mpy;
5886                         }
5887                         mv_cache[ 0 ][0]= mx;
5888                         mv_cache[ 0 ][1]= my;
5889
5890                         mvd_cache[ 0 ][0]= mx - mpx;
5891                         mvd_cache[ 0 ][1]= my - mpy;
5892                     }
5893                 }else{
5894                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5895                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5896                     p[0] = p[1] = p[8] = p[9] = 0;
5897                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5898                 }
5899             }
5900         }
5901     } else if( IS_DIRECT(mb_type) ) {
5902         pred_direct_motion(h, &mb_type);
5903         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5904         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5905         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5906     } else {
5907         int list, mx, my, i, mpx, mpy;
5908         if(IS_16X16(mb_type)){
5909             for(list=0; list<h->list_count; list++){
5910                 if(IS_DIR(mb_type, 0, list)){
5911                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
5912                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5913                 }else
5914                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5915             }
5916             for(list=0; list<h->list_count; list++){
5917                 if(IS_DIR(mb_type, 0, list)){
5918                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5919
5920                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5921                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5922                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5923
5924                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5925                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5926                 }else
5927                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5928             }
5929         }
5930         else if(IS_16X8(mb_type)){
5931             for(list=0; list<h->list_count; list++){
5932                     for(i=0; i<2; i++){
5933                         if(IS_DIR(mb_type, i, list)){
5934                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
5935                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5936                         }else
5937                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5938                     }
5939             }
5940             for(list=0; list<h->list_count; list++){
5941                 for(i=0; i<2; i++){
5942                     if(IS_DIR(mb_type, i, list)){
5943                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5944                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5945                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5946                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5947
5948                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5949                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5950                     }else{
5951                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5952                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5953                     }
5954                 }
5955             }
5956         }else{
5957             assert(IS_8X16(mb_type));
5958             for(list=0; list<h->list_count; list++){
5959                     for(i=0; i<2; i++){
5960                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5961                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
5962                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5963                         }else
5964                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5965                     }
5966             }
5967             for(list=0; list<h->list_count; list++){
5968                 for(i=0; i<2; i++){
5969                     if(IS_DIR(mb_type, i, list)){
5970                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5971                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5972                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5973
5974                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5975                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5976                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5977                     }else{
5978                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5979                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5980                     }
5981                 }
5982             }
5983         }
5984     }
5985
5986    if( IS_INTER( mb_type ) ) {
5987         h->chroma_pred_mode_table[mb_xy] = 0;
5988         write_back_motion( h, mb_type );
5989    }
5990
5991     if( !IS_INTRA16x16( mb_type ) ) {
5992         cbp  = decode_cabac_mb_cbp_luma( h );
5993         cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5994     }
5995
5996     h->cbp_table[mb_xy] = h->cbp = cbp;
5997
5998     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5999         if( decode_cabac_mb_transform_size( h ) )
6000             mb_type |= MB_TYPE_8x8DCT;
6001     }
6002     s->current_picture.mb_type[mb_xy]= mb_type;
6003
6004     if( cbp || IS_INTRA16x16( mb_type ) ) {
6005         const uint8_t *scan, *scan8x8, *dc_scan;
6006         const uint32_t *qmul;
6007         int dqp;
6008
6009         if(IS_INTERLACED(mb_type)){
6010             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
6011             scan= s->qscale ? h->field_scan : h->field_scan_q0;
6012             dc_scan= luma_dc_field_scan;
6013         }else{
6014             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
6015             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
6016             dc_scan= luma_dc_zigzag_scan;
6017         }
6018
6019         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
6020         if( dqp == INT_MIN ){
6021             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
6022             return -1;
6023         }
6024         s->qscale += dqp;
6025         if(((unsigned)s->qscale) > 51){
6026             if(s->qscale<0) s->qscale+= 52;
6027             else            s->qscale-= 52;
6028         }
6029         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
6030         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
6031
6032         if( IS_INTRA16x16( mb_type ) ) {
6033             int i;
6034             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
6035             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
6036
6037             if( cbp&15 ) {
6038                 qmul = h->dequant4_coeff[0][s->qscale];
6039                 for( i = 0; i < 16; i++ ) {
6040                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
6041                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
6042                 }
6043             } else {
6044                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
6045             }
6046         } else {
6047             int i8x8, i4x4;
6048             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
6049                 if( cbp & (1<<i8x8) ) {
6050                     if( IS_8x8DCT(mb_type) ) {
6051                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
6052                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
6053                     } else {
6054                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
6055                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
6056                             const int index = 4*i8x8 + i4x4;
6057                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
6058 //START_TIMER
6059                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
6060 //STOP_TIMER("decode_residual")
6061                         }
6062                     }
6063                 } else {
6064                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
6065                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
6066                 }
6067             }
6068         }
6069
6070         if( cbp&0x30 ){
6071             int c;
6072             for( c = 0; c < 2; c++ ) {
6073                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
6074                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
6075             }
6076         }
6077
6078         if( cbp&0x20 ) {
6079             int c, i;
6080             for( c = 0; c < 2; c++ ) {
6081                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
6082                 for( i = 0; i < 4; i++ ) {
6083                     const int index = 16 + 4 * c + i;
6084                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
6085                     decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15);
6086                 }
6087             }
6088         } else {
6089             uint8_t * const nnz= &h->non_zero_count_cache[0];
6090             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6091             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6092         }
6093     } else {
6094         uint8_t * const nnz= &h->non_zero_count_cache[0];
6095         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
6096         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6097         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6098         h->last_qscale_diff = 0;
6099     }
6100
6101     s->current_picture.qscale_table[mb_xy]= s->qscale;
6102     write_back_non_zero_count(h);
6103
6104     if(MB_MBAFF){
6105         h->ref_count[0] >>= 1;
6106         h->ref_count[1] >>= 1;
6107     }
6108
6109     return 0;
6110 }
6111
6112
6113 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6114     int i, d;
6115     const int index_a = qp + h->slice_alpha_c0_offset;
6116     const int alpha = (alpha_table+52)[index_a];
6117     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6118
6119     if( bS[0] < 4 ) {
6120         int8_t tc[4];
6121         for(i=0; i<4; i++)
6122             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6123         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6124     } else {
6125         /* 16px edge length, because bS=4 is triggered by being at
6126          * the edge of an intra MB, so all 4 bS are the same */
6127             for( d = 0; d < 16; d++ ) {
6128                 const int p0 = pix[-1];
6129                 const int p1 = pix[-2];
6130                 const int p2 = pix[-3];
6131
6132                 const int q0 = pix[0];
6133                 const int q1 = pix[1];
6134                 const int q2 = pix[2];
6135
6136                 if( FFABS( p0 - q0 ) < alpha &&
6137                     FFABS( p1 - p0 ) < beta &&
6138                     FFABS( q1 - q0 ) < beta ) {
6139
6140                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6141                         if( FFABS( p2 - p0 ) < beta)
6142                         {
6143                             const int p3 = pix[-4];
6144                             /* p0', p1', p2' */
6145                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6146                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6147                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6148                         } else {
6149                             /* p0' */
6150                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6151                         }
6152                         if( FFABS( q2 - q0 ) < beta)
6153                         {
6154                             const int q3 = pix[3];
6155                             /* q0', q1', q2' */
6156                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6157                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6158                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6159                         } else {
6160                             /* q0' */
6161                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6162                         }
6163                     }else{
6164                         /* p0', q0' */
6165                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6166                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6167                     }
6168                     tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
6169                 }
6170                 pix += stride;
6171             }
6172     }
6173 }
6174 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6175     int i;
6176     const int index_a = qp + h->slice_alpha_c0_offset;
6177     const int alpha = (alpha_table+52)[index_a];
6178     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6179
6180     if( bS[0] < 4 ) {
6181         int8_t tc[4];
6182         for(i=0; i<4; i++)
6183             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6184         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6185     } else {
6186         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6187     }
6188 }
6189
6190 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6191     int i;
6192     for( i = 0; i < 16; i++, pix += stride) {
6193         int index_a;
6194         int alpha;
6195         int beta;
6196
6197         int qp_index;
6198         int bS_index = (i >> 1);
6199         if (!MB_FIELD) {
6200             bS_index &= ~1;
6201             bS_index |= (i & 1);
6202         }
6203
6204         if( bS[bS_index] == 0 ) {
6205             continue;
6206         }
6207
6208         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6209         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6210         alpha = (alpha_table+52)[index_a];
6211         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6212
6213         if( bS[bS_index] < 4 ) {
6214             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
6215             const int p0 = pix[-1];
6216             const int p1 = pix[-2];
6217             const int p2 = pix[-3];
6218             const int q0 = pix[0];
6219             const int q1 = pix[1];
6220             const int q2 = pix[2];
6221
6222             if( FFABS( p0 - q0 ) < alpha &&
6223                 FFABS( p1 - p0 ) < beta &&
6224                 FFABS( q1 - q0 ) < beta ) {
6225                 int tc = tc0;
6226                 int i_delta;
6227
6228                 if( FFABS( p2 - p0 ) < beta ) {
6229                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6230                     tc++;
6231                 }
6232                 if( FFABS( q2 - q0 ) < beta ) {
6233                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6234                     tc++;
6235                 }
6236
6237                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6238                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6239                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6240                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6241             }
6242         }else{
6243             const int p0 = pix[-1];
6244             const int p1 = pix[-2];
6245             const int p2 = pix[-3];
6246
6247             const int q0 = pix[0];
6248             const int q1 = pix[1];
6249             const int q2 = pix[2];
6250
6251             if( FFABS( p0 - q0 ) < alpha &&
6252                 FFABS( p1 - p0 ) < beta &&
6253                 FFABS( q1 - q0 ) < beta ) {
6254
6255                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6256                     if( FFABS( p2 - p0 ) < beta)
6257                     {
6258                         const int p3 = pix[-4];
6259                         /* p0', p1', p2' */
6260                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6261                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6262                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6263                     } else {
6264                         /* p0' */
6265                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6266                     }
6267                     if( FFABS( q2 - q0 ) < beta)
6268                     {
6269                         const int q3 = pix[3];
6270                         /* q0', q1', q2' */
6271                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6272                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6273                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6274                     } else {
6275                         /* q0' */
6276                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6277                     }
6278                 }else{
6279                     /* p0', q0' */
6280                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6281                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6282                 }
6283                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6284             }
6285         }
6286     }
6287 }
6288 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6289     int i;
6290     for( i = 0; i < 8; i++, pix += stride) {
6291         int index_a;
6292         int alpha;
6293         int beta;
6294
6295         int qp_index;
6296         int bS_index = i;
6297
6298         if( bS[bS_index] == 0 ) {
6299             continue;
6300         }
6301
6302         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6303         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6304         alpha = (alpha_table+52)[index_a];
6305         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6306
6307         if( bS[bS_index] < 4 ) {
6308             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6309             const int p0 = pix[-1];
6310             const int p1 = pix[-2];
6311             const int q0 = pix[0];
6312             const int q1 = pix[1];
6313
6314             if( FFABS( p0 - q0 ) < alpha &&
6315                 FFABS( p1 - p0 ) < beta &&
6316                 FFABS( q1 - q0 ) < beta ) {
6317                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6318
6319                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6320                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6321                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6322             }
6323         }else{
6324             const int p0 = pix[-1];
6325             const int p1 = pix[-2];
6326             const int q0 = pix[0];
6327             const int q1 = pix[1];
6328
6329             if( FFABS( p0 - q0 ) < alpha &&
6330                 FFABS( p1 - p0 ) < beta &&
6331                 FFABS( q1 - q0 ) < beta ) {
6332
6333                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6334                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6335                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6336             }
6337         }
6338     }
6339 }
6340
6341 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6342     int i, d;
6343     const int index_a = qp + h->slice_alpha_c0_offset;
6344     const int alpha = (alpha_table+52)[index_a];
6345     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6346     const int pix_next  = stride;
6347
6348     if( bS[0] < 4 ) {
6349         int8_t tc[4];
6350         for(i=0; i<4; i++)
6351             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6352         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6353     } else {
6354         /* 16px edge length, see filter_mb_edgev */
6355             for( d = 0; d < 16; d++ ) {
6356                 const int p0 = pix[-1*pix_next];
6357                 const int p1 = pix[-2*pix_next];
6358                 const int p2 = pix[-3*pix_next];
6359                 const int q0 = pix[0];
6360                 const int q1 = pix[1*pix_next];
6361                 const int q2 = pix[2*pix_next];
6362
6363                 if( FFABS( p0 - q0 ) < alpha &&
6364                     FFABS( p1 - p0 ) < beta &&
6365                     FFABS( q1 - q0 ) < beta ) {
6366
6367                     const int p3 = pix[-4*pix_next];
6368                     const int q3 = pix[ 3*pix_next];
6369
6370                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6371                         if( FFABS( p2 - p0 ) < beta) {
6372                             /* p0', p1', p2' */
6373                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6374                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6375                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6376                         } else {
6377                             /* p0' */
6378                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6379                         }
6380                         if( FFABS( q2 - q0 ) < beta) {
6381                             /* q0', q1', q2' */
6382                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6383                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6384                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6385                         } else {
6386                             /* q0' */
6387                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6388                         }
6389                     }else{
6390                         /* p0', q0' */
6391                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6392                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6393                     }
6394                     tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6395                 }
6396                 pix++;
6397             }
6398     }
6399 }
6400
6401 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6402     int i;
6403     const int index_a = qp + h->slice_alpha_c0_offset;
6404     const int alpha = (alpha_table+52)[index_a];
6405     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6406
6407     if( bS[0] < 4 ) {
6408         int8_t tc[4];
6409         for(i=0; i<4; i++)
6410             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6411         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6412     } else {
6413         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6414     }
6415 }
6416
6417 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6418     MpegEncContext * const s = &h->s;
6419     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6420     int mb_xy, mb_type;
6421     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6422
6423     mb_xy = h->mb_xy;
6424
6425     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6426        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6427                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6428         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6429         return;
6430     }
6431     assert(!FRAME_MBAFF);
6432
6433     mb_type = s->current_picture.mb_type[mb_xy];
6434     qp = s->current_picture.qscale_table[mb_xy];
6435     qp0 = s->current_picture.qscale_table[mb_xy-1];
6436     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6437     qpc = get_chroma_qp( h, 0, qp );
6438     qpc0 = get_chroma_qp( h, 0, qp0 );
6439     qpc1 = get_chroma_qp( h, 0, qp1 );
6440     qp0 = (qp + qp0 + 1) >> 1;
6441     qp1 = (qp + qp1 + 1) >> 1;
6442     qpc0 = (qpc + qpc0 + 1) >> 1;
6443     qpc1 = (qpc + qpc1 + 1) >> 1;
6444     qp_thresh = 15 - h->slice_alpha_c0_offset;
6445     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6446        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6447         return;
6448
6449     if( IS_INTRA(mb_type) ) {
6450         int16_t bS4[4] = {4,4,4,4};
6451         int16_t bS3[4] = {3,3,3,3};
6452         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6453         if( IS_8x8DCT(mb_type) ) {
6454             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6455             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6456             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6457             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6458         } else {
6459             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6460             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6461             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6462             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6463             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6464             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6465             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6466             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6467         }
6468         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6469         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6470         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6471         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6472         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6473         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6474         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6475         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6476         return;
6477     } else {
6478         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6479         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6480         int edges;
6481         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6482             edges = 4;
6483             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6484         } else {
6485             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6486                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6487             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6488                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6489                              ? 3 : 0;
6490             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6491             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6492             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6493                                               (h->slice_type == FF_B_TYPE), edges, step, mask_edge0, mask_edge1 );
6494         }
6495         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6496             bSv[0][0] = 0x0004000400040004ULL;
6497         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6498             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6499
6500 #define FILTER(hv,dir,edge)\
6501         if(bSv[dir][edge]) {\
6502             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6503             if(!(edge&1)) {\
6504                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6505                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6506             }\
6507         }
6508         if( edges == 1 ) {
6509             FILTER(v,0,0);
6510             FILTER(h,1,0);
6511         } else if( IS_8x8DCT(mb_type) ) {
6512             FILTER(v,0,0);
6513             FILTER(v,0,2);
6514             FILTER(h,1,0);
6515             FILTER(h,1,2);
6516         } else {
6517             FILTER(v,0,0);
6518             FILTER(v,0,1);
6519             FILTER(v,0,2);
6520             FILTER(v,0,3);
6521             FILTER(h,1,0);
6522             FILTER(h,1,1);
6523             FILTER(h,1,2);
6524             FILTER(h,1,3);
6525         }
6526 #undef FILTER
6527     }
6528 }
6529
6530 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6531     MpegEncContext * const s = &h->s;
6532     const int mb_xy= mb_x + mb_y*s->mb_stride;
6533     const int mb_type = s->current_picture.mb_type[mb_xy];
6534     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6535     int first_vertical_edge_done = 0;
6536     int dir;
6537     /* FIXME: A given frame may occupy more than one position in
6538      * the reference list. So ref2frm should be populated with
6539      * frame numbers, not indexes. */
6540     static const int ref2frm[34] = {-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
6541                                     16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
6542
6543     //for sufficiently low qp, filtering wouldn't do anything
6544     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6545     if(!FRAME_MBAFF){
6546         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6547         int qp = s->current_picture.qscale_table[mb_xy];
6548         if(qp <= qp_thresh
6549            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6550            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6551             return;
6552         }
6553     }
6554
6555     if (FRAME_MBAFF
6556             // left mb is in picture
6557             && h->slice_table[mb_xy-1] != 255
6558             // and current and left pair do not have the same interlaced type
6559             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6560             // and left mb is in the same slice if deblocking_filter == 2
6561             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6562         /* First vertical edge is different in MBAFF frames
6563          * There are 8 different bS to compute and 2 different Qp
6564          */
6565         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6566         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6567         int16_t bS[8];
6568         int qp[2];
6569         int bqp[2];
6570         int rqp[2];
6571         int mb_qp, mbn0_qp, mbn1_qp;
6572         int i;
6573         first_vertical_edge_done = 1;
6574
6575         if( IS_INTRA(mb_type) )
6576             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6577         else {
6578             for( i = 0; i < 8; i++ ) {
6579                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6580
6581                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6582                     bS[i] = 4;
6583                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6584                          /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
6585                          h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
6586                     bS[i] = 2;
6587                 else
6588                     bS[i] = 1;
6589             }
6590         }
6591
6592         mb_qp = s->current_picture.qscale_table[mb_xy];
6593         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6594         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6595         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6596         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6597                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6598         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6599                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6600         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6601         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6602                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6603         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6604                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6605
6606         /* Filter edge */
6607         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6608         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6609         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6610         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6611         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6612     }
6613     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6614     for( dir = 0; dir < 2; dir++ )
6615     {
6616         int edge;
6617         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6618         const int mbm_type = s->current_picture.mb_type[mbm_xy];
6619         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
6620
6621         const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6622                                   == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6623         // how often to recheck mv-based bS when iterating between edges
6624         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6625                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6626         // how often to recheck mv-based bS when iterating along each edge
6627         const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6628
6629         if (first_vertical_edge_done) {
6630             start = 1;
6631             first_vertical_edge_done = 0;
6632         }
6633
6634         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6635             start = 1;
6636
6637         if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6638             && !IS_INTERLACED(mb_type)
6639             && IS_INTERLACED(mbm_type)
6640             ) {
6641             // This is a special case in the norm where the filtering must
6642             // be done twice (one each of the field) even if we are in a
6643             // frame macroblock.
6644             //
6645             static const int nnz_idx[4] = {4,5,6,3};
6646             unsigned int tmp_linesize   = 2 *   linesize;
6647             unsigned int tmp_uvlinesize = 2 * uvlinesize;
6648             int mbn_xy = mb_xy - 2 * s->mb_stride;
6649             int qp;
6650             int i, j;
6651             int16_t bS[4];
6652
6653             for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6654                 if( IS_INTRA(mb_type) ||
6655                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6656                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6657                 } else {
6658                     const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6659                     for( i = 0; i < 4; i++ ) {
6660                         if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6661                             mbn_nnz[nnz_idx[i]] != 0 )
6662                             bS[i] = 2;
6663                         else
6664                             bS[i] = 1;
6665                     }
6666                 }
6667                 // Do not use s->qscale as luma quantizer because it has not the same
6668                 // value in IPCM macroblocks.
6669                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6670                 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6671                 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6672                 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6673                 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6674                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6675                 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6676                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6677             }
6678
6679             start = 1;
6680         }
6681
6682         /* Calculate bS */
6683         for( edge = start; edge < edges; edge++ ) {
6684             /* mbn_xy: neighbor macroblock */
6685             const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6686             const int mbn_type = s->current_picture.mb_type[mbn_xy];
6687             int16_t bS[4];
6688             int qp;
6689
6690             if( (edge&1) && IS_8x8DCT(mb_type) )
6691                 continue;
6692
6693             if( IS_INTRA(mb_type) ||
6694                 IS_INTRA(mbn_type) ) {
6695                 int value;
6696                 if (edge == 0) {
6697                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6698                         || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6699                     ) {
6700                         value = 4;
6701                     } else {
6702                         value = 3;
6703                     }
6704                 } else {
6705                     value = 3;
6706                 }
6707                 bS[0] = bS[1] = bS[2] = bS[3] = value;
6708             } else {
6709                 int i, l;
6710                 int mv_done;
6711
6712                 if( edge & mask_edge ) {
6713                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
6714                     mv_done = 1;
6715                 }
6716                 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6717                     bS[0] = bS[1] = bS[2] = bS[3] = 1;
6718                     mv_done = 1;
6719                 }
6720                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6721                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
6722                     int bn_idx= b_idx - (dir ? 8:1);
6723                     int v = 0;
6724                     for( l = 0; !v && l < 1 + (h->slice_type == FF_B_TYPE); l++ ) {
6725                         v |= ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6726                              FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6727                              FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6728                     }
6729                     bS[0] = bS[1] = bS[2] = bS[3] = v;
6730                     mv_done = 1;
6731                 }
6732                 else
6733                     mv_done = 0;
6734
6735                 for( i = 0; i < 4; i++ ) {
6736                     int x = dir == 0 ? edge : i;
6737                     int y = dir == 0 ? i    : edge;
6738                     int b_idx= 8 + 4 + x + 8*y;
6739                     int bn_idx= b_idx - (dir ? 8:1);
6740
6741                     if( h->non_zero_count_cache[b_idx] != 0 ||
6742                         h->non_zero_count_cache[bn_idx] != 0 ) {
6743                         bS[i] = 2;
6744                     }
6745                     else if(!mv_done)
6746                     {
6747                         bS[i] = 0;
6748                         for( l = 0; l < 1 + (h->slice_type == FF_B_TYPE); l++ ) {
6749                             if( ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6750                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6751                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6752                                 bS[i] = 1;
6753                                 break;
6754                             }
6755                         }
6756                     }
6757                 }
6758
6759                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6760                     continue;
6761             }
6762
6763             /* Filter edge */
6764             // Do not use s->qscale as luma quantizer because it has not the same
6765             // value in IPCM macroblocks.
6766             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6767             //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6768             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6769             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6770             if( dir == 0 ) {
6771                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6772                 if( (edge&1) == 0 ) {
6773                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6774                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6775                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6776                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6777                 }
6778             } else {
6779                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6780                 if( (edge&1) == 0 ) {
6781                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6782                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6783                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6784                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6785                 }
6786             }
6787         }
6788     }
6789 }
6790
6791 static int decode_slice(struct AVCodecContext *avctx, H264Context *h){
6792     MpegEncContext * const s = &h->s;
6793     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6794
6795     s->mb_skip_run= -1;
6796
6797     if( h->pps.cabac ) {
6798         int i;
6799
6800         /* realign */
6801         align_get_bits( &s->gb );
6802
6803         /* init cabac */
6804         ff_init_cabac_states( &h->cabac);
6805         ff_init_cabac_decoder( &h->cabac,
6806                                s->gb.buffer + get_bits_count(&s->gb)/8,
6807                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6808         /* calculate pre-state */
6809         for( i= 0; i < 460; i++ ) {
6810             int pre;
6811             if( h->slice_type == FF_I_TYPE )
6812                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6813             else
6814                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6815
6816             if( pre <= 63 )
6817                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6818             else
6819                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6820         }
6821
6822         for(;;){
6823 //START_TIMER
6824             int ret = decode_mb_cabac(h);
6825             int eos;
6826 //STOP_TIMER("decode_mb_cabac")
6827
6828             if(ret>=0) hl_decode_mb(h);
6829
6830             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6831                 s->mb_y++;
6832
6833                 if(ret>=0) ret = decode_mb_cabac(h);
6834
6835                 if(ret>=0) hl_decode_mb(h);
6836                 s->mb_y--;
6837             }
6838             eos = get_cabac_terminate( &h->cabac );
6839
6840             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6841                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6842                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6843                 return -1;
6844             }
6845
6846             if( ++s->mb_x >= s->mb_width ) {
6847                 s->mb_x = 0;
6848                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6849                 ++s->mb_y;
6850                 if(FIELD_OR_MBAFF_PICTURE) {
6851                     ++s->mb_y;
6852                 }
6853             }
6854
6855             if( eos || s->mb_y >= s->mb_height ) {
6856                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6857                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6858                 return 0;
6859             }
6860         }
6861
6862     } else {
6863         for(;;){
6864             int ret = decode_mb_cavlc(h);
6865
6866             if(ret>=0) hl_decode_mb(h);
6867
6868             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6869                 s->mb_y++;
6870                 ret = decode_mb_cavlc(h);
6871
6872                 if(ret>=0) hl_decode_mb(h);
6873                 s->mb_y--;
6874             }
6875
6876             if(ret<0){
6877                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6878                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6879
6880                 return -1;
6881             }
6882
6883             if(++s->mb_x >= s->mb_width){
6884                 s->mb_x=0;
6885                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6886                 ++s->mb_y;
6887                 if(FIELD_OR_MBAFF_PICTURE) {
6888                     ++s->mb_y;
6889                 }
6890                 if(s->mb_y >= s->mb_height){
6891                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6892
6893                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6894                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6895
6896                         return 0;
6897                     }else{
6898                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6899
6900                         return -1;
6901                     }
6902                 }
6903             }
6904
6905             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6906                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6907                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6908                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6909
6910                     return 0;
6911                 }else{
6912                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6913
6914                     return -1;
6915                 }
6916             }
6917         }
6918     }
6919
6920 #if 0
6921     for(;s->mb_y < s->mb_height; s->mb_y++){
6922         for(;s->mb_x < s->mb_width; s->mb_x++){
6923             int ret= decode_mb(h);
6924
6925             hl_decode_mb(h);
6926
6927             if(ret<0){
6928                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6929                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6930
6931                 return -1;
6932             }
6933
6934             if(++s->mb_x >= s->mb_width){
6935                 s->mb_x=0;
6936                 if(++s->mb_y >= s->mb_height){
6937                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6938                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6939
6940                         return 0;
6941                     }else{
6942                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6943
6944                         return -1;
6945                     }
6946                 }
6947             }
6948
6949             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6950                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6951                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6952
6953                     return 0;
6954                 }else{
6955                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6956
6957                     return -1;
6958                 }
6959             }
6960         }
6961         s->mb_x=0;
6962         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6963     }
6964 #endif
6965     return -1; //not reached
6966 }
6967
6968 static int decode_unregistered_user_data(H264Context *h, int size){
6969     MpegEncContext * const s = &h->s;
6970     uint8_t user_data[16+256];
6971     int e, build, i;
6972
6973     if(size<16)
6974         return -1;
6975
6976     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6977         user_data[i]= get_bits(&s->gb, 8);
6978     }
6979
6980     user_data[i]= 0;
6981     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6982     if(e==1 && build>=0)
6983         h->x264_build= build;
6984
6985     if(s->avctx->debug & FF_DEBUG_BUGS)
6986         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6987
6988     for(; i<size; i++)
6989         skip_bits(&s->gb, 8);
6990
6991     return 0;
6992 }
6993
6994 static int decode_sei(H264Context *h){
6995     MpegEncContext * const s = &h->s;
6996
6997     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6998         int size, type;
6999
7000         type=0;
7001         do{
7002             type+= show_bits(&s->gb, 8);
7003         }while(get_bits(&s->gb, 8) == 255);
7004
7005         size=0;
7006         do{
7007             size+= show_bits(&s->gb, 8);
7008         }while(get_bits(&s->gb, 8) == 255);
7009
7010         switch(type){
7011         case 5:
7012             if(decode_unregistered_user_data(h, size) < 0)
7013                 return -1;
7014             break;
7015         default:
7016             skip_bits(&s->gb, 8*size);
7017         }
7018
7019         //FIXME check bits here
7020         align_get_bits(&s->gb);
7021     }
7022
7023     return 0;
7024 }
7025
7026 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
7027     MpegEncContext * const s = &h->s;
7028     int cpb_count, i;
7029     cpb_count = get_ue_golomb(&s->gb) + 1;
7030     get_bits(&s->gb, 4); /* bit_rate_scale */
7031     get_bits(&s->gb, 4); /* cpb_size_scale */
7032     for(i=0; i<cpb_count; i++){
7033         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
7034         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
7035         get_bits1(&s->gb);     /* cbr_flag */
7036     }
7037     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
7038     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
7039     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
7040     get_bits(&s->gb, 5); /* time_offset_length */
7041 }
7042
7043 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
7044     MpegEncContext * const s = &h->s;
7045     int aspect_ratio_info_present_flag;
7046     unsigned int aspect_ratio_idc;
7047     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
7048
7049     aspect_ratio_info_present_flag= get_bits1(&s->gb);
7050
7051     if( aspect_ratio_info_present_flag ) {
7052         aspect_ratio_idc= get_bits(&s->gb, 8);
7053         if( aspect_ratio_idc == EXTENDED_SAR ) {
7054             sps->sar.num= get_bits(&s->gb, 16);
7055             sps->sar.den= get_bits(&s->gb, 16);
7056         }else if(aspect_ratio_idc < sizeof(pixel_aspect)/sizeof(*pixel_aspect)){
7057             sps->sar=  pixel_aspect[aspect_ratio_idc];
7058         }else{
7059             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
7060             return -1;
7061         }
7062     }else{
7063         sps->sar.num=
7064         sps->sar.den= 0;
7065     }
7066 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
7067
7068     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
7069         get_bits1(&s->gb);      /* overscan_appropriate_flag */
7070     }
7071
7072     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
7073         get_bits(&s->gb, 3);    /* video_format */
7074         get_bits1(&s->gb);      /* video_full_range_flag */
7075         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
7076             get_bits(&s->gb, 8); /* colour_primaries */
7077             get_bits(&s->gb, 8); /* transfer_characteristics */
7078             get_bits(&s->gb, 8); /* matrix_coefficients */
7079         }
7080     }
7081
7082     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
7083         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
7084         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
7085     }
7086
7087     sps->timing_info_present_flag = get_bits1(&s->gb);
7088     if(sps->timing_info_present_flag){
7089         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7090         sps->time_scale = get_bits_long(&s->gb, 32);
7091         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7092     }
7093
7094     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7095     if(nal_hrd_parameters_present_flag)
7096         decode_hrd_parameters(h, sps);
7097     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7098     if(vcl_hrd_parameters_present_flag)
7099         decode_hrd_parameters(h, sps);
7100     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
7101         get_bits1(&s->gb);     /* low_delay_hrd_flag */
7102     get_bits1(&s->gb);         /* pic_struct_present_flag */
7103
7104     sps->bitstream_restriction_flag = get_bits1(&s->gb);
7105     if(sps->bitstream_restriction_flag){
7106         unsigned int num_reorder_frames;
7107         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
7108         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7109         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7110         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7111         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7112         num_reorder_frames= get_ue_golomb(&s->gb);
7113         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7114
7115         if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7116             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", num_reorder_frames);
7117             return -1;
7118         }
7119
7120         sps->num_reorder_frames= num_reorder_frames;
7121     }
7122
7123     return 0;
7124 }
7125
7126 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7127                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7128     MpegEncContext * const s = &h->s;
7129     int i, last = 8, next = 8;
7130     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
7131     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7132         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7133     else
7134     for(i=0;i<size;i++){
7135         if(next)
7136             next = (last + get_se_golomb(&s->gb)) & 0xff;
7137         if(!i && !next){ /* matrix not written, we use the preset one */
7138             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7139             break;
7140         }
7141         last = factors[scan[i]] = next ? next : last;
7142     }
7143 }
7144
7145 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7146                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7147     MpegEncContext * const s = &h->s;
7148     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7149     const uint8_t *fallback[4] = {
7150         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7151         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7152         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7153         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7154     };
7155     if(get_bits1(&s->gb)){
7156         sps->scaling_matrix_present |= is_sps;
7157         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7158         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7159         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7160         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7161         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7162         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7163         if(is_sps || pps->transform_8x8_mode){
7164             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7165             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7166         }
7167     } else if(fallback_sps) {
7168         memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
7169         memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
7170     }
7171 }
7172
7173 /**
7174  * Returns and optionally allocates SPS / PPS structures in the supplied array 'vec'
7175  */
7176 static void *
7177 alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
7178                     const size_t size, const char *name)
7179 {
7180     if(id>=max) {
7181         av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%d) out of range\n", name, id);
7182         return NULL;
7183     }
7184
7185     if(!vec[id]) {
7186         vec[id] = av_mallocz(size);
7187         if(vec[id] == NULL)
7188             av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
7189     }
7190     return vec[id];
7191 }
7192
7193 static inline int decode_seq_parameter_set(H264Context *h){
7194     MpegEncContext * const s = &h->s;
7195     int profile_idc, level_idc;
7196     unsigned int sps_id, tmp, mb_width, mb_height;
7197     int i;
7198     SPS *sps;
7199
7200     profile_idc= get_bits(&s->gb, 8);
7201     get_bits1(&s->gb);   //constraint_set0_flag
7202     get_bits1(&s->gb);   //constraint_set1_flag
7203     get_bits1(&s->gb);   //constraint_set2_flag
7204     get_bits1(&s->gb);   //constraint_set3_flag
7205     get_bits(&s->gb, 4); // reserved
7206     level_idc= get_bits(&s->gb, 8);
7207     sps_id= get_ue_golomb(&s->gb);
7208
7209     sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
7210     if(sps == NULL)
7211         return -1;
7212
7213     sps->profile_idc= profile_idc;
7214     sps->level_idc= level_idc;
7215
7216     if(sps->profile_idc >= 100){ //high profile
7217         if(get_ue_golomb(&s->gb) == 3) //chroma_format_idc
7218             get_bits1(&s->gb);  //residual_color_transform_flag
7219         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7220         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7221         sps->transform_bypass = get_bits1(&s->gb);
7222         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7223     }else
7224         sps->scaling_matrix_present = 0;
7225
7226     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7227     sps->poc_type= get_ue_golomb(&s->gb);
7228
7229     if(sps->poc_type == 0){ //FIXME #define
7230         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7231     } else if(sps->poc_type == 1){//FIXME #define
7232         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7233         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7234         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7235         tmp= get_ue_golomb(&s->gb);
7236
7237         if(tmp >= sizeof(sps->offset_for_ref_frame) / sizeof(sps->offset_for_ref_frame[0])){
7238             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
7239             return -1;
7240         }
7241         sps->poc_cycle_length= tmp;
7242
7243         for(i=0; i<sps->poc_cycle_length; i++)
7244             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7245     }else if(sps->poc_type != 2){
7246         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7247         return -1;
7248     }
7249
7250     tmp= get_ue_golomb(&s->gb);
7251     if(tmp > MAX_PICTURE_COUNT-2 || tmp >= 32){
7252         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7253         return -1;
7254     }
7255     sps->ref_frame_count= tmp;
7256     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7257     mb_width= get_ue_golomb(&s->gb) + 1;
7258     mb_height= get_ue_golomb(&s->gb) + 1;
7259     if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
7260        avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
7261         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7262         return -1;
7263     }
7264     sps->mb_width = mb_width;
7265     sps->mb_height= mb_height;
7266
7267     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7268     if(!sps->frame_mbs_only_flag)
7269         sps->mb_aff= get_bits1(&s->gb);
7270     else
7271         sps->mb_aff= 0;
7272
7273     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7274
7275 #ifndef ALLOW_INTERLACE
7276     if(sps->mb_aff)
7277         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7278 #endif
7279     if(!sps->direct_8x8_inference_flag && sps->mb_aff)
7280         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
7281
7282     sps->crop= get_bits1(&s->gb);
7283     if(sps->crop){
7284         sps->crop_left  = get_ue_golomb(&s->gb);
7285         sps->crop_right = get_ue_golomb(&s->gb);
7286         sps->crop_top   = get_ue_golomb(&s->gb);
7287         sps->crop_bottom= get_ue_golomb(&s->gb);
7288         if(sps->crop_left || sps->crop_top){
7289             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7290         }
7291         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !h->sps.frame_mbs_only_flag)){
7292             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7293         }
7294     }else{
7295         sps->crop_left  =
7296         sps->crop_right =
7297         sps->crop_top   =
7298         sps->crop_bottom= 0;
7299     }
7300
7301     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7302     if( sps->vui_parameters_present_flag )
7303         decode_vui_parameters(h, sps);
7304
7305     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7306         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
7307                sps_id, sps->profile_idc, sps->level_idc,
7308                sps->poc_type,
7309                sps->ref_frame_count,
7310                sps->mb_width, sps->mb_height,
7311                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7312                sps->direct_8x8_inference_flag ? "8B8" : "",
7313                sps->crop_left, sps->crop_right,
7314                sps->crop_top, sps->crop_bottom,
7315                sps->vui_parameters_present_flag ? "VUI" : ""
7316                );
7317     }
7318     return 0;
7319 }
7320
7321 static void
7322 build_qp_table(PPS *pps, int t, int index)
7323 {
7324     int i;
7325     for(i = 0; i < 255; i++)
7326         pps->chroma_qp_table[t][i & 0xff] = chroma_qp[av_clip(i + index, 0, 51)];
7327 }
7328
7329 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7330     MpegEncContext * const s = &h->s;
7331     unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
7332     PPS *pps;
7333
7334     pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
7335     if(pps == NULL)
7336         return -1;
7337
7338     tmp= get_ue_golomb(&s->gb);
7339     if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
7340         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7341         return -1;
7342     }
7343     pps->sps_id= tmp;
7344
7345     pps->cabac= get_bits1(&s->gb);
7346     pps->pic_order_present= get_bits1(&s->gb);
7347     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7348     if(pps->slice_group_count > 1 ){
7349         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7350         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7351         switch(pps->mb_slice_group_map_type){
7352         case 0:
7353 #if 0
7354 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7355 |    run_length[ i ]                                |1  |ue(v)   |
7356 #endif
7357             break;
7358         case 2:
7359 #if 0
7360 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7361 |{                                                  |   |        |
7362 |    top_left_mb[ i ]                               |1  |ue(v)   |
7363 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7364 |   }                                               |   |        |
7365 #endif
7366             break;
7367         case 3:
7368         case 4:
7369         case 5:
7370 #if 0
7371 |   slice_group_change_direction_flag               |1  |u(1)    |
7372 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7373 #endif
7374             break;
7375         case 6:
7376 #if 0
7377 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7378 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7379 |)                                                  |   |        |
7380 |    slice_group_id[ i ]                            |1  |u(v)    |
7381 #endif
7382             break;
7383         }
7384     }
7385     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7386     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7387     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7388         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7389         pps->ref_count[0]= pps->ref_count[1]= 1;
7390         return -1;
7391     }
7392
7393     pps->weighted_pred= get_bits1(&s->gb);
7394     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7395     pps->init_qp= get_se_golomb(&s->gb) + 26;
7396     pps->init_qs= get_se_golomb(&s->gb) + 26;
7397     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7398     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7399     pps->constrained_intra_pred= get_bits1(&s->gb);
7400     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7401
7402     pps->transform_8x8_mode= 0;
7403     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7404     memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
7405     memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
7406
7407     if(get_bits_count(&s->gb) < bit_length){
7408         pps->transform_8x8_mode= get_bits1(&s->gb);
7409         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7410         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7411     } else {
7412         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7413     }
7414
7415     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7416     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1]) {
7417         build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7418         h->pps.chroma_qp_diff= 1;
7419     } else
7420         memcpy(pps->chroma_qp_table[1], pps->chroma_qp_table[0], 256);
7421
7422     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7423         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7424                pps_id, pps->sps_id,
7425                pps->cabac ? "CABAC" : "CAVLC",
7426                pps->slice_group_count,
7427                pps->ref_count[0], pps->ref_count[1],
7428                pps->weighted_pred ? "weighted" : "",
7429                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7430                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7431                pps->constrained_intra_pred ? "CONSTR" : "",
7432                pps->redundant_pic_cnt_present ? "REDU" : "",
7433                pps->transform_8x8_mode ? "8x8DCT" : ""
7434                );
7435     }
7436
7437     return 0;
7438 }
7439
7440 /**
7441  * Call decode_slice() for each context.
7442  *
7443  * @param h h264 master context
7444  * @param context_count number of contexts to execute
7445  */
7446 static void execute_decode_slices(H264Context *h, int context_count){
7447     MpegEncContext * const s = &h->s;
7448     AVCodecContext * const avctx= s->avctx;
7449     H264Context *hx;
7450     int i;
7451
7452     if(context_count == 1) {
7453         decode_slice(avctx, h);
7454     } else {
7455         for(i = 1; i < context_count; i++) {
7456             hx = h->thread_context[i];
7457             hx->s.error_resilience = avctx->error_resilience;
7458             hx->s.error_count = 0;
7459         }
7460
7461         avctx->execute(avctx, (void *)decode_slice,
7462                        (void **)h->thread_context, NULL, context_count);
7463
7464         /* pull back stuff from slices to master context */
7465         hx = h->thread_context[context_count - 1];
7466         s->mb_x = hx->s.mb_x;
7467         s->mb_y = hx->s.mb_y;
7468         s->dropable = hx->s.dropable;
7469         s->picture_structure = hx->s.picture_structure;
7470         for(i = 1; i < context_count; i++)
7471             h->s.error_count += h->thread_context[i]->s.error_count;
7472     }
7473 }
7474
7475
7476 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7477     MpegEncContext * const s = &h->s;
7478     AVCodecContext * const avctx= s->avctx;
7479     int buf_index=0;
7480     H264Context *hx; ///< thread context
7481     int context_count = 0;
7482
7483     h->max_contexts = avctx->thread_count;
7484 #if 0
7485     int i;
7486     for(i=0; i<50; i++){
7487         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7488     }
7489 #endif
7490     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7491         h->current_slice = 0;
7492         if (!s->first_field)
7493             s->current_picture_ptr= NULL;
7494     }
7495
7496     for(;;){
7497         int consumed;
7498         int dst_length;
7499         int bit_length;
7500         const uint8_t *ptr;
7501         int i, nalsize = 0;
7502         int err;
7503
7504         if(h->is_avc) {
7505             if(buf_index >= buf_size) break;
7506             nalsize = 0;
7507             for(i = 0; i < h->nal_length_size; i++)
7508                 nalsize = (nalsize << 8) | buf[buf_index++];
7509             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7510                 if(nalsize == 1){
7511                     buf_index++;
7512                     continue;
7513                 }else{
7514                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7515                     break;
7516                 }
7517             }
7518         } else {
7519             // start code prefix search
7520             for(; buf_index + 3 < buf_size; buf_index++){
7521                 // This should always succeed in the first iteration.
7522                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7523                     break;
7524             }
7525
7526             if(buf_index+3 >= buf_size) break;
7527
7528             buf_index+=3;
7529         }
7530
7531         hx = h->thread_context[context_count];
7532
7533         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7534         if (ptr==NULL || dst_length < 0){
7535             return -1;
7536         }
7537         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7538             dst_length--;
7539         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7540
7541         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7542             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7543         }
7544
7545         if (h->is_avc && (nalsize != consumed)){
7546             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7547             consumed= nalsize;
7548         }
7549
7550         buf_index += consumed;
7551
7552         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7553            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7554             continue;
7555
7556       again:
7557         err = 0;
7558         switch(hx->nal_unit_type){
7559         case NAL_IDR_SLICE:
7560             if (h->nal_unit_type != NAL_IDR_SLICE) {
7561                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7562                 return -1;
7563             }
7564             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7565         case NAL_SLICE:
7566             init_get_bits(&hx->s.gb, ptr, bit_length);
7567             hx->intra_gb_ptr=
7568             hx->inter_gb_ptr= &hx->s.gb;
7569             hx->s.data_partitioning = 0;
7570
7571             if((err = decode_slice_header(hx, h)))
7572                break;
7573
7574             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7575             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7576                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7577                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type!=FF_B_TYPE)
7578                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type==FF_I_TYPE)
7579                && avctx->skip_frame < AVDISCARD_ALL)
7580                 context_count++;
7581             break;
7582         case NAL_DPA:
7583             init_get_bits(&hx->s.gb, ptr, bit_length);
7584             hx->intra_gb_ptr=
7585             hx->inter_gb_ptr= NULL;
7586             hx->s.data_partitioning = 1;
7587
7588             err = decode_slice_header(hx, h);
7589             break;
7590         case NAL_DPB:
7591             init_get_bits(&hx->intra_gb, ptr, bit_length);
7592             hx->intra_gb_ptr= &hx->intra_gb;
7593             break;
7594         case NAL_DPC:
7595             init_get_bits(&hx->inter_gb, ptr, bit_length);
7596             hx->inter_gb_ptr= &hx->inter_gb;
7597
7598             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7599                && s->context_initialized
7600                && s->hurry_up < 5
7601                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7602                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type!=FF_B_TYPE)
7603                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type==FF_I_TYPE)
7604                && avctx->skip_frame < AVDISCARD_ALL)
7605                 context_count++;
7606             break;
7607         case NAL_SEI:
7608             init_get_bits(&s->gb, ptr, bit_length);
7609             decode_sei(h);
7610             break;
7611         case NAL_SPS:
7612             init_get_bits(&s->gb, ptr, bit_length);
7613             decode_seq_parameter_set(h);
7614
7615             if(s->flags& CODEC_FLAG_LOW_DELAY)
7616                 s->low_delay=1;
7617
7618             if(avctx->has_b_frames < 2)
7619                 avctx->has_b_frames= !s->low_delay;
7620             break;
7621         case NAL_PPS:
7622             init_get_bits(&s->gb, ptr, bit_length);
7623
7624             decode_picture_parameter_set(h, bit_length);
7625
7626             break;
7627         case NAL_AUD:
7628         case NAL_END_SEQUENCE:
7629         case NAL_END_STREAM:
7630         case NAL_FILLER_DATA:
7631         case NAL_SPS_EXT:
7632         case NAL_AUXILIARY_SLICE:
7633             break;
7634         default:
7635             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7636         }
7637
7638         if(context_count == h->max_contexts) {
7639             execute_decode_slices(h, context_count);
7640             context_count = 0;
7641         }
7642
7643         if (err < 0)
7644             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7645         else if(err == 1) {
7646             /* Slice could not be decoded in parallel mode, copy down
7647              * NAL unit stuff to context 0 and restart. Note that
7648              * rbsp_buffer is not transfered, but since we no longer
7649              * run in parallel mode this should not be an issue. */
7650             h->nal_unit_type = hx->nal_unit_type;
7651             h->nal_ref_idc   = hx->nal_ref_idc;
7652             hx = h;
7653             goto again;
7654         }
7655     }
7656     if(context_count)
7657         execute_decode_slices(h, context_count);
7658     return buf_index;
7659 }
7660
7661 /**
7662  * returns the number of bytes consumed for building the current frame
7663  */
7664 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7665     if(s->flags&CODEC_FLAG_TRUNCATED){
7666         pos -= s->parse_context.last_index;
7667         if(pos<0) pos=0; // FIXME remove (unneeded?)
7668
7669         return pos;
7670     }else{
7671         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7672         if(pos+10>buf_size) pos=buf_size; // oops ;)
7673
7674         return pos;
7675     }
7676 }
7677
7678 static int decode_frame(AVCodecContext *avctx,
7679                              void *data, int *data_size,
7680                              const uint8_t *buf, int buf_size)
7681 {
7682     H264Context *h = avctx->priv_data;
7683     MpegEncContext *s = &h->s;
7684     AVFrame *pict = data;
7685     int buf_index;
7686
7687     s->flags= avctx->flags;
7688     s->flags2= avctx->flags2;
7689
7690     if(s->flags&CODEC_FLAG_TRUNCATED){
7691         const int next= ff_h264_find_frame_end(h, buf, buf_size);
7692         assert((buf_size > 0) || (next == END_NOT_FOUND));
7693
7694         if( ff_combine_frame(&s->parse_context, next, &buf, &buf_size) < 0 )
7695           return buf_size;
7696 //printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
7697     }
7698
7699    /* no supplementary picture */
7700     if (buf_size == 0) {
7701         Picture *out;
7702         int i, out_idx;
7703
7704 //FIXME factorize this with the output code below
7705         out = h->delayed_pic[0];
7706         out_idx = 0;
7707         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7708             if(h->delayed_pic[i]->poc < out->poc){
7709                 out = h->delayed_pic[i];
7710                 out_idx = i;
7711             }
7712
7713         for(i=out_idx; h->delayed_pic[i]; i++)
7714             h->delayed_pic[i] = h->delayed_pic[i+1];
7715
7716         if(out){
7717             *data_size = sizeof(AVFrame);
7718             *pict= *(AVFrame*)out;
7719         }
7720
7721         return 0;
7722     }
7723
7724     if(h->is_avc && !h->got_avcC) {
7725         int i, cnt, nalsize;
7726         unsigned char *p = avctx->extradata;
7727         if(avctx->extradata_size < 7) {
7728             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7729             return -1;
7730         }
7731         if(*p != 1) {
7732             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7733             return -1;
7734         }
7735         /* sps and pps in the avcC always have length coded with 2 bytes,
7736            so put a fake nal_length_size = 2 while parsing them */
7737         h->nal_length_size = 2;
7738         // Decode sps from avcC
7739         cnt = *(p+5) & 0x1f; // Number of sps
7740         p += 6;
7741         for (i = 0; i < cnt; i++) {
7742             nalsize = AV_RB16(p) + 2;
7743             if(decode_nal_units(h, p, nalsize) < 0) {
7744                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7745                 return -1;
7746             }
7747             p += nalsize;
7748         }
7749         // Decode pps from avcC
7750         cnt = *(p++); // Number of pps
7751         for (i = 0; i < cnt; i++) {
7752             nalsize = AV_RB16(p) + 2;
7753             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7754                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7755                 return -1;
7756             }
7757             p += nalsize;
7758         }
7759         // Now store right nal length size, that will be use to parse all other nals
7760         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7761         // Do not reparse avcC
7762         h->got_avcC = 1;
7763     }
7764
7765     if(avctx->frame_number==0 && !h->is_avc && s->avctx->extradata_size){
7766         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7767             return -1;
7768     }
7769
7770     buf_index=decode_nal_units(h, buf, buf_size);
7771     if(buf_index < 0)
7772         return -1;
7773
7774     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7775         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7776         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7777         return -1;
7778     }
7779
7780     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7781         Picture *out = s->current_picture_ptr;
7782         Picture *cur = s->current_picture_ptr;
7783         Picture *prev = h->delayed_output_pic;
7784         int i, pics, cross_idr, out_of_order, out_idx;
7785
7786         s->mb_y= 0;
7787
7788         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7789         s->current_picture_ptr->pict_type= s->pict_type;
7790
7791         h->prev_frame_num_offset= h->frame_num_offset;
7792         h->prev_frame_num= h->frame_num;
7793         if(!s->dropable) {
7794             h->prev_poc_msb= h->poc_msb;
7795             h->prev_poc_lsb= h->poc_lsb;
7796             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7797         }
7798
7799         /*
7800          * FIXME: Error handling code does not seem to support interlaced
7801          * when slices span multiple rows
7802          * The ff_er_add_slice calls don't work right for bottom
7803          * fields; they cause massive erroneous error concealing
7804          * Error marking covers both fields (top and bottom).
7805          * This causes a mismatched s->error_count
7806          * and a bad error table. Further, the error count goes to
7807          * INT_MAX when called for bottom field, because mb_y is
7808          * past end by one (callers fault) and resync_mb_y != 0
7809          * causes problems for the first MB line, too.
7810          */
7811         if (!FIELD_PICTURE)
7812             ff_er_frame_end(s);
7813
7814         MPV_frame_end(s);
7815
7816         if (s->first_field) {
7817             /* Wait for second field. */
7818             *data_size = 0;
7819
7820         } else {
7821             cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7822             /* Derive top_field_first from field pocs. */
7823             cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7824
7825         //FIXME do something with unavailable reference frames
7826
7827 #if 0 //decode order
7828             *data_size = sizeof(AVFrame);
7829 #else
7830             /* Sort B-frames into display order */
7831
7832             if(h->sps.bitstream_restriction_flag
7833                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7834                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7835                 s->low_delay = 0;
7836             }
7837
7838             pics = 0;
7839             while(h->delayed_pic[pics]) pics++;
7840
7841             assert(pics+1 < sizeof(h->delayed_pic) / sizeof(h->delayed_pic[0]));
7842
7843             h->delayed_pic[pics++] = cur;
7844             if(cur->reference == 0)
7845                 cur->reference = DELAYED_PIC_REF;
7846
7847             cross_idr = 0;
7848             for(i=0; h->delayed_pic[i]; i++)
7849                 if(h->delayed_pic[i]->key_frame || h->delayed_pic[i]->poc==0)
7850                     cross_idr = 1;
7851
7852             out = h->delayed_pic[0];
7853             out_idx = 0;
7854             for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7855                 if(h->delayed_pic[i]->poc < out->poc){
7856                     out = h->delayed_pic[i];
7857                     out_idx = i;
7858                 }
7859
7860             out_of_order = !cross_idr && prev && out->poc < prev->poc;
7861             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7862                 { }
7863             else if(prev && pics <= s->avctx->has_b_frames)
7864                 out = prev;
7865             else if((out_of_order && pics-1 == s->avctx->has_b_frames && pics < 15)
7866                || (s->low_delay &&
7867                 ((!cross_idr && prev && out->poc > prev->poc + 2)
7868                  || cur->pict_type == FF_B_TYPE)))
7869             {
7870                 s->low_delay = 0;
7871                 s->avctx->has_b_frames++;
7872                 out = prev;
7873             }
7874             else if(out_of_order)
7875                 out = prev;
7876
7877             if(out_of_order || pics > s->avctx->has_b_frames){
7878                 for(i=out_idx; h->delayed_pic[i]; i++)
7879                     h->delayed_pic[i] = h->delayed_pic[i+1];
7880             }
7881
7882             if(prev == out)
7883                 *data_size = 0;
7884             else
7885                 *data_size = sizeof(AVFrame);
7886             if(prev && prev != out && prev->reference == DELAYED_PIC_REF)
7887                 prev->reference = 0;
7888             h->delayed_output_pic = out;
7889 #endif
7890
7891             if(out)
7892                 *pict= *(AVFrame*)out;
7893             else
7894                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7895         }
7896     }
7897
7898     assert(pict->data[0] || !*data_size);
7899     ff_print_debug_info(s, pict);
7900 //printf("out %d\n", (int)pict->data[0]);
7901 #if 0 //?
7902
7903     /* Return the Picture timestamp as the frame number */
7904     /* we subtract 1 because it is added on utils.c     */
7905     avctx->frame_number = s->picture_number - 1;
7906 #endif
7907     return get_consumed_bytes(s, buf_index, buf_size);
7908 }
7909 #if 0
7910 static inline void fill_mb_avail(H264Context *h){
7911     MpegEncContext * const s = &h->s;
7912     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
7913
7914     if(s->mb_y){
7915         h->mb_avail[0]= s->mb_x                 && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
7916         h->mb_avail[1]=                            h->slice_table[mb_xy - s->mb_stride    ] == h->slice_num;
7917         h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
7918     }else{
7919         h->mb_avail[0]=
7920         h->mb_avail[1]=
7921         h->mb_avail[2]= 0;
7922     }
7923     h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
7924     h->mb_avail[4]= 1; //FIXME move out
7925     h->mb_avail[5]= 0; //FIXME move out
7926 }
7927 #endif
7928
7929 #ifdef TEST
7930 #undef printf
7931 #undef random
7932 #define COUNT 8000
7933 #define SIZE (COUNT*40)
7934 int main(void){
7935     int i;
7936     uint8_t temp[SIZE];
7937     PutBitContext pb;
7938     GetBitContext gb;
7939 //    int int_temp[10000];
7940     DSPContext dsp;
7941     AVCodecContext avctx;
7942
7943     dsputil_init(&dsp, &avctx);
7944
7945     init_put_bits(&pb, temp, SIZE);
7946     printf("testing unsigned exp golomb\n");
7947     for(i=0; i<COUNT; i++){
7948         START_TIMER
7949         set_ue_golomb(&pb, i);
7950         STOP_TIMER("set_ue_golomb");
7951     }
7952     flush_put_bits(&pb);
7953
7954     init_get_bits(&gb, temp, 8*SIZE);
7955     for(i=0; i<COUNT; i++){
7956         int j, s;
7957
7958         s= show_bits(&gb, 24);
7959
7960         START_TIMER
7961         j= get_ue_golomb(&gb);
7962         if(j != i){
7963             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7964 //            return -1;
7965         }
7966         STOP_TIMER("get_ue_golomb");
7967     }
7968
7969
7970     init_put_bits(&pb, temp, SIZE);
7971     printf("testing signed exp golomb\n");
7972     for(i=0; i<COUNT; i++){
7973         START_TIMER
7974         set_se_golomb(&pb, i - COUNT/2);
7975         STOP_TIMER("set_se_golomb");
7976     }
7977     flush_put_bits(&pb);
7978
7979     init_get_bits(&gb, temp, 8*SIZE);
7980     for(i=0; i<COUNT; i++){
7981         int j, s;
7982
7983         s= show_bits(&gb, 24);
7984
7985         START_TIMER
7986         j= get_se_golomb(&gb);
7987         if(j != i - COUNT/2){
7988             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7989 //            return -1;
7990         }
7991         STOP_TIMER("get_se_golomb");
7992     }
7993
7994 #if 0
7995     printf("testing 4x4 (I)DCT\n");
7996
7997     DCTELEM block[16];
7998     uint8_t src[16], ref[16];
7999     uint64_t error= 0, max_error=0;
8000
8001     for(i=0; i<COUNT; i++){
8002         int j;
8003 //        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
8004         for(j=0; j<16; j++){
8005             ref[j]= random()%255;
8006             src[j]= random()%255;
8007         }
8008
8009         h264_diff_dct_c(block, src, ref, 4);
8010
8011         //normalize
8012         for(j=0; j<16; j++){
8013 //            printf("%d ", block[j]);
8014             block[j]= block[j]*4;
8015             if(j&1) block[j]= (block[j]*4 + 2)/5;
8016             if(j&4) block[j]= (block[j]*4 + 2)/5;
8017         }
8018 //        printf("\n");
8019
8020         s->dsp.h264_idct_add(ref, block, 4);
8021 /*        for(j=0; j<16; j++){
8022             printf("%d ", ref[j]);
8023         }
8024         printf("\n");*/
8025
8026         for(j=0; j<16; j++){
8027             int diff= FFABS(src[j] - ref[j]);
8028
8029             error+= diff*diff;
8030             max_error= FFMAX(max_error, diff);
8031         }
8032     }
8033     printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
8034     printf("testing quantizer\n");
8035     for(qp=0; qp<52; qp++){
8036         for(i=0; i<16; i++)
8037             src1_block[i]= src2_block[i]= random()%255;
8038
8039     }
8040     printf("Testing NAL layer\n");
8041
8042     uint8_t bitstream[COUNT];
8043     uint8_t nal[COUNT*2];
8044     H264Context h;
8045     memset(&h, 0, sizeof(H264Context));
8046
8047     for(i=0; i<COUNT; i++){
8048         int zeros= i;
8049         int nal_length;
8050         int consumed;
8051         int out_length;
8052         uint8_t *out;
8053         int j;
8054
8055         for(j=0; j<COUNT; j++){
8056             bitstream[j]= (random() % 255) + 1;
8057         }
8058
8059         for(j=0; j<zeros; j++){
8060             int pos= random() % COUNT;
8061             while(bitstream[pos] == 0){
8062                 pos++;
8063                 pos %= COUNT;
8064             }
8065             bitstream[pos]=0;
8066         }
8067
8068         START_TIMER
8069
8070         nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
8071         if(nal_length<0){
8072             printf("encoding failed\n");
8073             return -1;
8074         }
8075
8076         out= decode_nal(&h, nal, &out_length, &consumed, nal_length);
8077
8078         STOP_TIMER("NAL")
8079
8080         if(out_length != COUNT){
8081             printf("incorrect length %d %d\n", out_length, COUNT);
8082             return -1;
8083         }
8084
8085         if(consumed != nal_length){
8086             printf("incorrect consumed length %d %d\n", nal_length, consumed);
8087             return -1;
8088         }
8089
8090         if(memcmp(bitstream, out, COUNT)){
8091             printf("mismatch\n");
8092             return -1;
8093         }
8094     }
8095 #endif
8096
8097     printf("Testing RBSP\n");
8098
8099
8100     return 0;
8101 }
8102 #endif /* TEST */
8103
8104
8105 static av_cold int decode_end(AVCodecContext *avctx)
8106 {
8107     H264Context *h = avctx->priv_data;
8108     MpegEncContext *s = &h->s;
8109
8110     av_freep(&h->rbsp_buffer[0]);
8111     av_freep(&h->rbsp_buffer[1]);
8112     free_tables(h); //FIXME cleanup init stuff perhaps
8113     MPV_common_end(s);
8114
8115 //    memset(h, 0, sizeof(H264Context));
8116
8117     return 0;
8118 }
8119
8120
8121 AVCodec h264_decoder = {
8122     "h264",
8123     CODEC_TYPE_VIDEO,
8124     CODEC_ID_H264,
8125     sizeof(H264Context),
8126     decode_init,
8127     NULL,
8128     decode_end,
8129     decode_frame,
8130     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,
8131     .flush= flush_dpb,
8132     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
8133 };
8134
8135 #include "svq3.c"