git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35 #include "rectangle.h"
  36
  37 #include "cabac.h"
  38 #ifdef ARCH_X86
  39 #include "i386/h264_i386.h"
  40 #endif
  41
  42 //#undef NDEBUG
  43 #include <assert.h>
  44
  45 /**
  46  * Value of Picture.reference when Picture is not a reference picture, but
  47  * is held for delayed output.
  48  */
  49 #define DELAYED_PIC_REF 4
  50
  51 static VLC coeff_token_vlc[4];
  52 static VLC chroma_dc_coeff_token_vlc;
  53
  54 static VLC total_zeros_vlc[15];
  55 static VLC chroma_dc_total_zeros_vlc[3];
  56
  57 static VLC run_vlc[6];
  58 static VLC run7_vlc;
  59
  60 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  61 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  62 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  63 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  64
  65 static av_always_inline uint32_t pack16to32(int a, int b){
  66 #ifdef WORDS_BIGENDIAN
  67    return (b&0xFFFF) + (a<<16);
  68 #else
  69    return (a&0xFFFF) + (b<<16);
  70 #endif
  71 }
  72
  73 const uint8_t ff_rem6[52]={
  74 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
  75 };
  76
  77 const uint8_t ff_div6[52]={
  78 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
  79 };
  80
  81
  82 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
  83     MpegEncContext * const s = &h->s;
  84     const int mb_xy= h->mb_xy;
  85     int topleft_xy, top_xy, topright_xy, left_xy[2];
  86     int topleft_type, top_type, topright_type, left_type[2];
  87     int left_block[8];
  88     int topleft_partition= -1;
  89     int i;
  90
  91     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
  92
  93     //FIXME deblocking could skip the intra and nnz parts.
  94     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
  95         return;
  96
  97     /* Wow, what a mess, why didn't they simplify the interlacing & intra
  98      * stuff, I can't imagine that these complex rules are worth it. */
  99
 100     topleft_xy = top_xy - 1;
 101     topright_xy= top_xy + 1;
 102     left_xy[1] = left_xy[0] = mb_xy-1;
 103     left_block[0]= 0;
 104     left_block[1]= 1;
 105     left_block[2]= 2;
 106     left_block[3]= 3;
 107     left_block[4]= 7;
 108     left_block[5]= 10;
 109     left_block[6]= 8;
 110     left_block[7]= 11;
 111     if(FRAME_MBAFF){
 112         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 113         const int top_pair_xy      = pair_xy     - s->mb_stride;
 114         const int topleft_pair_xy  = top_pair_xy - 1;
 115         const int topright_pair_xy = top_pair_xy + 1;
 116         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 117         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 118         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 119         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 120         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 121         const int bottom = (s->mb_y & 1);
 122         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 123         if (bottom
 124                 ? !curr_mb_frame_flag // bottom macroblock
 125                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 126                 ) {
 127             top_xy -= s->mb_stride;
 128         }
 129         if (bottom
 130                 ? !curr_mb_frame_flag // bottom macroblock
 131                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 132                 ) {
 133             topleft_xy -= s->mb_stride;
 134         } else if(bottom && curr_mb_frame_flag && !left_mb_frame_flag) {
 135             topleft_xy += s->mb_stride;
 136             // take topleft mv from the middle of the mb, as opposed to all other modes which use the bottom-right partition
 137             topleft_partition = 0;
 138         }
 139         if (bottom
 140                 ? !curr_mb_frame_flag // bottom macroblock
 141                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 142                 ) {
 143             topright_xy -= s->mb_stride;
 144         }
 145         if (left_mb_frame_flag != curr_mb_frame_flag) {
 146             left_xy[1] = left_xy[0] = pair_xy - 1;
 147             if (curr_mb_frame_flag) {
 148                 if (bottom) {
 149                     left_block[0]= 2;
 150                     left_block[1]= 2;
 151                     left_block[2]= 3;
 152                     left_block[3]= 3;
 153                     left_block[4]= 8;
 154                     left_block[5]= 11;
 155                     left_block[6]= 8;
 156                     left_block[7]= 11;
 157                 } else {
 158                     left_block[0]= 0;
 159                     left_block[1]= 0;
 160                     left_block[2]= 1;
 161                     left_block[3]= 1;
 162                     left_block[4]= 7;
 163                     left_block[5]= 10;
 164                     left_block[6]= 7;
 165                     left_block[7]= 10;
 166                 }
 167             } else {
 168                 left_xy[1] += s->mb_stride;
 169                 //left_block[0]= 0;
 170                 left_block[1]= 2;
 171                 left_block[2]= 0;
 172                 left_block[3]= 2;
 173                 //left_block[4]= 7;
 174                 left_block[5]= 10;
 175                 left_block[6]= 7;
 176                 left_block[7]= 10;
 177             }
 178         }
 179     }
 180
 181     h->top_mb_xy = top_xy;
 182     h->left_mb_xy[0] = left_xy[0];
 183     h->left_mb_xy[1] = left_xy[1];
 184     if(for_deblock){
 185         topleft_type = 0;
 186         topright_type = 0;
 187         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
 188         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
 189         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
 190
 191         if(FRAME_MBAFF && !IS_INTRA(mb_type)){
 192             int list;
 193             int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
 194             for(i=0; i<16; i++)
 195                 h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
 196             for(list=0; list<h->list_count; list++){
 197                 if(USES_LIST(mb_type,list)){
 198                     uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
 199                     uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
 200                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 201                     for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
 202                         dst[0] = src[0];
 203                         dst[1] = src[1];
 204                         dst[2] = src[2];
 205                         dst[3] = src[3];
 206                     }
 207                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 208                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
 209                     ref += h->b8_stride;
 210                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 211                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
 212                 }else{
 213                     fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
 214                     fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
 215                 }
 216             }
 217         }
 218     }else{
 219         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 220         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 221         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 222         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 223         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 224     }
 225
 226     if(IS_INTRA(mb_type)){
 227         h->topleft_samples_available=
 228         h->top_samples_available=
 229         h->left_samples_available= 0xFFFF;
 230         h->topright_samples_available= 0xEEEA;
 231
 232         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
 233             h->topleft_samples_available= 0xB3FF;
 234             h->top_samples_available= 0x33FF;
 235             h->topright_samples_available= 0x26EA;
 236         }
 237         for(i=0; i<2; i++){
 238             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
 239                 h->topleft_samples_available&= 0xDF5F;
 240                 h->left_samples_available&= 0x5F5F;
 241             }
 242         }
 243
 244         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
 245             h->topleft_samples_available&= 0x7FFF;
 246
 247         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
 248             h->topright_samples_available&= 0xFBFF;
 249
 250         if(IS_INTRA4x4(mb_type)){
 251             if(IS_INTRA4x4(top_type)){
 252                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 253                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 254                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 255                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 256             }else{
 257                 int pred;
 258                 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
 259                     pred= -1;
 260                 else{
 261                     pred= 2;
 262                 }
 263                 h->intra4x4_pred_mode_cache[4+8*0]=
 264                 h->intra4x4_pred_mode_cache[5+8*0]=
 265                 h->intra4x4_pred_mode_cache[6+8*0]=
 266                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 267             }
 268             for(i=0; i<2; i++){
 269                 if(IS_INTRA4x4(left_type[i])){
 270                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 271                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 272                 }else{
 273                     int pred;
 274                     if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
 275                         pred= -1;
 276                     else{
 277                         pred= 2;
 278                     }
 279                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 280                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 281                 }
 282             }
 283         }
 284     }
 285
 286
 287 /*
 288 0 . T T. T T T T
 289 1 L . .L . . . .
 290 2 L . .L . . . .
 291 3 . T TL . . . .
 292 4 L . .L . . . .
 293 5 L . .. . . . .
 294 */
 295 //FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
 296     if(top_type){
 297         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 298         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 299         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 300         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 301
 302         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 303         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 304
 305         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 306         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 307
 308     }else{
 309         h->non_zero_count_cache[4+8*0]=
 310         h->non_zero_count_cache[5+8*0]=
 311         h->non_zero_count_cache[6+8*0]=
 312         h->non_zero_count_cache[7+8*0]=
 313
 314         h->non_zero_count_cache[1+8*0]=
 315         h->non_zero_count_cache[2+8*0]=
 316
 317         h->non_zero_count_cache[1+8*3]=
 318         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 319
 320     }
 321
 322     for (i=0; i<2; i++) {
 323         if(left_type[i]){
 324             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 325             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 326             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 327             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 328         }else{
 329             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 330             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 331             h->non_zero_count_cache[0+8*1 +   8*i]=
 332             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 333         }
 334     }
 335
 336     if( h->pps.cabac ) {
 337         // top_cbp
 338         if(top_type) {
 339             h->top_cbp = h->cbp_table[top_xy];
 340         } else if(IS_INTRA(mb_type)) {
 341             h->top_cbp = 0x1C0;
 342         } else {
 343             h->top_cbp = 0;
 344         }
 345         // left_cbp
 346         if (left_type[0]) {
 347             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 348         } else if(IS_INTRA(mb_type)) {
 349             h->left_cbp = 0x1C0;
 350         } else {
 351             h->left_cbp = 0;
 352         }
 353         if (left_type[0]) {
 354             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 355         }
 356         if (left_type[1]) {
 357             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 358         }
 359     }
 360
 361 #if 1
 362     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 363         int list;
 364         for(list=0; list<h->list_count; list++){
 365             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 366                 /*if(!h->mv_cache_clean[list]){
 367                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 368                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 369                     h->mv_cache_clean[list]= 1;
 370                 }*/
 371                 continue;
 372             }
 373             h->mv_cache_clean[list]= 0;
 374
 375             if(USES_LIST(top_type, list)){
 376                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 377                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 378                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 379                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 380                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 381                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 382                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 383                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 384                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 385                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 386             }else{
 387                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 388                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 389                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 390                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 391                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 392             }
 393
 394             for(i=0; i<2; i++){
 395                 int cache_idx = scan8[0] - 1 + i*2*8;
 396                 if(USES_LIST(left_type[i], list)){
 397                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 398                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 399                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 400                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 401                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 402                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 403                 }else{
 404                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 405                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 406                     h->ref_cache[list][cache_idx  ]=
 407                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 408                 }
 409             }
 410
 411             if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
 412                 continue;
 413
 414             if(USES_LIST(topleft_type, list)){
 415                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
 416                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
 417                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 418                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 419             }else{
 420                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 421                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 422             }
 423
 424             if(USES_LIST(topright_type, list)){
 425                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 426                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 427                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 428                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 429             }else{
 430                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 431                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 432             }
 433
 434             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 435                 continue;
 436
 437             h->ref_cache[list][scan8[5 ]+1] =
 438             h->ref_cache[list][scan8[7 ]+1] =
 439             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 440             h->ref_cache[list][scan8[4 ]] =
 441             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 442             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 443             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 444             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 445             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 446             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 447
 448             if( h->pps.cabac ) {
 449                 /* XXX beurk, Load mvd */
 450                 if(USES_LIST(top_type, list)){
 451                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 452                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 453                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 454                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 455                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 456                 }else{
 457                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 458                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 459                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 460                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 461                 }
 462                 if(USES_LIST(left_type[0], list)){
 463                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 464                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 465                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 466                 }else{
 467                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 468                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 469                 }
 470                 if(USES_LIST(left_type[1], list)){
 471                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 472                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 473                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 474                 }else{
 475                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 476                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 477                 }
 478                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 479                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 480                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 481                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 482                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 483
 484                 if(h->slice_type_nos == FF_B_TYPE){
 485                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 486
 487                     if(IS_DIRECT(top_type)){
 488                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 489                     }else if(IS_8X8(top_type)){
 490                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 491                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 492                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 493                     }else{
 494                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 495                     }
 496
 497                     if(IS_DIRECT(left_type[0]))
 498                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 499                     else if(IS_8X8(left_type[0]))
 500                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 501                     else
 502                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 503
 504                     if(IS_DIRECT(left_type[1]))
 505                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 506                     else if(IS_8X8(left_type[1]))
 507                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 508                     else
 509                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 510                 }
 511             }
 512
 513             if(FRAME_MBAFF){
 514 #define MAP_MVS\
 515                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 516                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 517                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 518                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 519                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 520                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 521                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 522                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 523                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 524                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 525                 if(MB_FIELD){
 526 #define MAP_F2F(idx, mb_type)\
 527                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 528                         h->ref_cache[list][idx] <<= 1;\
 529                         h->mv_cache[list][idx][1] /= 2;\
 530                         h->mvd_cache[list][idx][1] /= 2;\
 531                     }
 532                     MAP_MVS
 533 #undef MAP_F2F
 534                 }else{
 535 #define MAP_F2F(idx, mb_type)\
 536                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 537                         h->ref_cache[list][idx] >>= 1;\
 538                         h->mv_cache[list][idx][1] <<= 1;\
 539                         h->mvd_cache[list][idx][1] <<= 1;\
 540                     }
 541                     MAP_MVS
 542 #undef MAP_F2F
 543                 }
 544             }
 545         }
 546     }
 547 #endif
 548
 549     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 550 }
 551
 552 static inline void write_back_intra_pred_mode(H264Context *h){
 553     const int mb_xy= h->mb_xy;
 554
 555     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 556     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 557     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 558     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 559     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 560     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 561     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 562 }
 563
 564 /**
 565  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 566  */
 567 static inline int check_intra4x4_pred_mode(H264Context *h){
 568     MpegEncContext * const s = &h->s;
 569     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 570     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 571     int i;
 572
 573     if(!(h->top_samples_available&0x8000)){
 574         for(i=0; i<4; i++){
 575             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 576             if(status<0){
 577                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 578                 return -1;
 579             } else if(status){
 580                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 581             }
 582         }
 583     }
 584
 585     if(!(h->left_samples_available&0x8000)){
 586         for(i=0; i<4; i++){
 587             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 588             if(status<0){
 589                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 590                 return -1;
 591             } else if(status){
 592                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 593             }
 594         }
 595     }
 596
 597     return 0;
 598 } //FIXME cleanup like next
 599
 600 /**
 601  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 602  */
 603 static inline int check_intra_pred_mode(H264Context *h, int mode){
 604     MpegEncContext * const s = &h->s;
 605     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 606     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 607
 608     if(mode > 6U) {
 609         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 610         return -1;
 611     }
 612
 613     if(!(h->top_samples_available&0x8000)){
 614         mode= top[ mode ];
 615         if(mode<0){
 616             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 617             return -1;
 618         }
 619     }
 620
 621     if(!(h->left_samples_available&0x8000)){
 622         mode= left[ mode ];
 623         if(mode<0){
 624             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 625             return -1;
 626         }
 627     }
 628
 629     return mode;
 630 }
 631
 632 /**
 633  * gets the predicted intra4x4 prediction mode.
 634  */
 635 static inline int pred_intra_mode(H264Context *h, int n){
 636     const int index8= scan8[n];
 637     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 638     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 639     const int min= FFMIN(left, top);
 640
 641     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 642
 643     if(min<0) return DC_PRED;
 644     else      return min;
 645 }
 646
 647 static inline void write_back_non_zero_count(H264Context *h){
 648     const int mb_xy= h->mb_xy;
 649
 650     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 651     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 652     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 653     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 654     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 655     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 656     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 657
 658     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 659     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 660     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 661
 662     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 663     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 664     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 665
 666     if(FRAME_MBAFF){
 667         // store all luma nnzs, for deblocking
 668         int v = 0, i;
 669         for(i=0; i<16; i++)
 670             v += (!!h->non_zero_count_cache[scan8[i]]) << i;
 671         *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
 672     }
 673 }
 674
 675 /**
 676  * gets the predicted number of non zero coefficients.
 677  * @param n block index
 678  */
 679 static inline int pred_non_zero_count(H264Context *h, int n){
 680     const int index8= scan8[n];
 681     const int left= h->non_zero_count_cache[index8 - 1];
 682     const int top = h->non_zero_count_cache[index8 - 8];
 683     int i= left + top;
 684
 685     if(i<64) i= (i+1)>>1;
 686
 687     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 688
 689     return i&31;
 690 }
 691
     /**
      * Selects the diagonal ("C") neighbour used for motion vector prediction.
      *
      * Normally this is the cache entry above-right of the partition
      * (i - 8 + part_width); if its reference is PART_NOT_AVAILABLE the
      * above-left entry (i - 8 - 1) is used instead. With FRAME_MBAFF a few
      * frame/field mismatch cases are handled first: the motion vector is read
      * straight from the current picture, its vertical component is rescaled
      * and it is stored in the scratch slot mv_cache[list][scan8[0]-2].
      *
      * @param C          receives a pointer to the selected motion vector
      * @param i          position in the ref/mv caches (callers pass scan8[n])
      * @param list       reference list
      * @param part_width partition width in cache entries
      * @return the reference index stored for the selected neighbour, or
      *         LIST_NOT_USED when an MBAFF path finds that the neighbouring
      *         macroblock does not use this list
      */
 692 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 693     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 694     MpegEncContext *s = &h->s;
 695
 696     /* there is no consistent mapping of mvs to neighboring locations that will
 697      * make mbaff happy, so we can't move all this logic to fill_caches */
 698     if(FRAME_MBAFF){
 699         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 700         const int16_t *mv;
             /* scratch slot: starts as a zero mv, SET_DIAG_MV overwrites it */
 701         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 702         *C = h->mv_cache[list][scan8[0]-2];
 703
 704         if(!MB_FIELD
 705            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 706             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 707             if(IS_INTERLACED(mb_types[topright_xy])){
                     /* SET_DIAG_MV always returns from this function: it copies
                      * the mv at 4x4 position (X4,Y4) of the current picture into
                      * the scratch slot, applies MV_OP to its vertical component
                      * and returns the matching ref index with REF_OP applied
                      * (or LIST_NOT_USED if that macroblock does not use list). */
 708 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 709                 const int x4 = X4, y4 = Y4;\
 710                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 711                 if(!USES_LIST(mb_type,list))\
 712                     return LIST_NOT_USED;\
 713                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 714                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 715                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 716                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 717
 718                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 719             }
 720         }
             /* top-right missing: for blocks in cache column 4 whose left
              * neighbour is available, the diagonal may come from the left
              * macroblock when its frame/field coding differs from ours */
 721         if(topright_ref == PART_NOT_AVAILABLE
 722            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 723            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 724             if(!MB_FIELD
 725                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 726                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 727             }
 728             if(MB_FIELD
 729                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 730                && i >= scan8[0]+8){
 731                 // leftshift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's ok.
 732                 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 733             }
 734         }
 735 #undef SET_DIAG_MV
 736     }
 737
         /* regular case: above-right neighbour if available, else above-left */
 738     if(topright_ref != PART_NOT_AVAILABLE){
 739         *C= h->mv_cache[list][ i - 8 + part_width ];
 740         return topright_ref;
 741     }else{
 742         tprintf(s->avctx, "topright MV not available\n");
 743
 744         *C= h->mv_cache[list][ i - 8 - 1 ];
 745         return h->ref_cache[list][ i - 8 - 1 ];
 746     }
 747 }
 748
 749 /**
 750  * gets the predicted MV.
 751  * @param n the block index
 752  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 753  * @param mx the x component of the predicted motion vector
 754  * @param my the y component of the predicted motion vector
 755  */
 756 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 757     const int index8= scan8[n];
 758     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 759     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 760     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 761     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 762     const int16_t * C;
 763     int diagonal_ref, match_count;
 764
 765     assert(part_width==1 || part_width==2 || part_width==4);
 766
 767 /* mv_cache
 768   B . . A T T T T
 769   U . . L . . , .
 770   U . . L . . . .
 771   U . . L . . , .
 772   . . . L . . . .
 773 */
 774
 775     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 776     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 777     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 778     if(match_count > 1){ //most common
 779         *mx= mid_pred(A[0], B[0], C[0]);
 780         *my= mid_pred(A[1], B[1], C[1]);
 781     }else if(match_count==1){
 782         if(left_ref==ref){
 783             *mx= A[0];
 784             *my= A[1];
 785         }else if(top_ref==ref){
 786             *mx= B[0];
 787             *my= B[1];
 788         }else{
 789             *mx= C[0];
 790             *my= C[1];
 791         }
 792     }else{
 793         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 794             *mx= A[0];
 795             *my= A[1];
 796         }else{
 797             *mx= mid_pred(A[0], B[0], C[0]);
 798             *my= mid_pred(A[1], B[1], C[1]);
 799         }
 800     }
 801
 802     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 803 }
 804
 805 /**
 806  * gets the directionally predicted 16x8 MV.
 807  * @param n the block index
 808  * @param mx the x component of the predicted motion vector
 809  * @param my the y component of the predicted motion vector
 810  */
 811 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 812     if(n==0){
 813         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 814         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 815
 816         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 817
 818         if(top_ref == ref){
 819             *mx= B[0];
 820             *my= B[1];
 821             return;
 822         }
 823     }else{
 824         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 825         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 826
 827         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 828
 829         if(left_ref == ref){
 830             *mx= A[0];
 831             *my= A[1];
 832             return;
 833         }
 834     }
 835
 836     //RARE
 837     pred_motion(h, n, 4, list, ref, mx, my);
 838 }
 839
 840 /**
 841  * gets the directionally predicted 8x16 MV.
 842  * @param n the block index
 843  * @param mx the x component of the predicted motion vector
 844  * @param my the y component of the predicted motion vector
 845  */
 846 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 847     if(n==0){
 848         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 849         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 850
 851         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 852
 853         if(left_ref == ref){
 854             *mx= A[0];
 855             *my= A[1];
 856             return;
 857         }
 858     }else{
 859         const int16_t * C;
 860         int diagonal_ref;
 861
 862         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 863
 864         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 865
 866         if(diagonal_ref == ref){
 867             *mx= C[0];
 868             *my= C[1];
 869             return;
 870         }
 871     }
 872
 873     //RARE
 874     pred_motion(h, n, 2, list, ref, mx, my);
 875 }
 876
 877 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 878     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 879     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 880
 881     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 882
 883     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 884        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 885        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 886
 887         *mx = *my = 0;
 888         return;
 889     }
 890
 891     pred_motion(h, 0, 4, 0, 0, mx, my);
 892
 893     return;
 894 }
 895
 896 static inline void direct_dist_scale_factor(H264Context * const h){
 897     const int poc = h->s.current_picture_ptr->poc;
 898     const int poc1 = h->ref_list[1][0].poc;
 899     int i;
 900     for(i=0; i<h->ref_count[0]; i++){
 901         int poc0 = h->ref_list[0][i].poc;
 902         int td = av_clip(poc1 - poc0, -128, 127);
 903         if(td == 0 /* FIXME || pic0 is a long-term ref */){
 904             h->dist_scale_factor[i] = 256;
 905         }else{
 906             int tb = av_clip(poc - poc0, -128, 127);
 907             int tx = (16384 + (FFABS(td) >> 1)) / td;
 908             h->dist_scale_factor[i] = av_clip((tb*tx + 32) >> 6, -1024, 1023);
 909         }
 910     }
 911     if(FRAME_MBAFF){
 912         for(i=0; i<h->ref_count[0]; i++){
 913             h->dist_scale_factor_field[2*i] =
 914             h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
 915         }
 916     }
 917 }
 918 static inline void direct_ref_list_init(H264Context * const h){
 919     MpegEncContext * const s = &h->s;
 920     Picture * const ref1 = &h->ref_list[1][0];
 921     Picture * const cur = s->current_picture_ptr;
 922     int list, i, j;
 923     if(cur->pict_type == FF_I_TYPE)
 924         cur->ref_count[0] = 0;
 925     if(cur->pict_type != FF_B_TYPE)
 926         cur->ref_count[1] = 0;
 927     for(list=0; list<2; list++){
 928         cur->ref_count[list] = h->ref_count[list];
 929         for(j=0; j<h->ref_count[list]; j++)
 930             cur->ref_poc[list][j] = h->ref_list[list][j].poc;
 931     }
 932     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 933         return;
 934     for(list=0; list<2; list++){
 935         for(i=0; i<ref1->ref_count[list]; i++){
 936             const int poc = ref1->ref_poc[list][i];
 937             h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
 938             for(j=0; j<h->ref_count[list]; j++)
 939                 if(h->ref_list[list][j].poc == poc){
 940                     h->map_col_to_list0[list][i] = j;
 941                     break;
 942                 }
 943         }
 944     }
 945     if(FRAME_MBAFF){
 946         for(list=0; list<2; list++){
 947             for(i=0; i<ref1->ref_count[list]; i++){
 948                 j = h->map_col_to_list0[list][i];
 949                 h->map_col_to_list0_field[list][2*i] = 2*j;
 950                 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
 951             }
 952         }
 953     }
 954 }
 955
     /**
      * Fills h->ref_cache and h->mv_cache for a direct predicted macroblock.
      *
      * If *mb_type is an 8x8 type on entry, only the sub-blocks whose
      * h->sub_mb_type[] is direct are filled; otherwise the whole macroblock
      * is. Spatial prediction is used when h->direct_spatial_mv_pred is set,
      * temporal prediction otherwise. Both use the co-located data of the first
      * list 1 reference picture (h->ref_list[1][0]).
      *
      * @param mb_type in/out: macroblock type; rewritten with the partitioning
      *                and list usage chosen here. h->sub_mb_type[] is updated
      *                for every sub-block that is filled in an 8x8 loop.
      */
 956 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 957     MpegEncContext * const s = &h->s;
 958     const int mb_xy =   h->mb_xy;
 959     const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
 960     const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
         /* type, motion vectors and reference indices of the co-located macroblock */
 961     const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
 962     const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
 963     const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
 964     const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
 965     const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
 966     const int is_b8x8 = IS_8X8(*mb_type);
 967     unsigned int sub_mb_type;
 968     int i8, i4;
 969
         /* pick the partitioning from the co-located macroblock type */
 970 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
 971     if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
 972         /* FIXME save sub mb types from previous frames (or derive from MVs)
 973          * so we know exactly what block size to use */
 974         sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
 975         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
 976     }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
 977         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
 978         *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
 979     }else{
 980         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
 981         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
 982     }
 983     if(!is_b8x8)
 984         *mb_type |= MB_TYPE_DIRECT2;
 985     if(MB_FIELD)
 986         *mb_type |= MB_TYPE_INTERLACED;
 987
 988     tprintf(s->avctx, "mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);
 989
 990     if(h->direct_spatial_mv_pred){
 991         int ref[2];
 992         int mv[2][2];
 993         int list;
 994
 995         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
 996
 997         /* ref = min(neighbors) */
 998         for(list=0; list<2; list++){
 999             int refa = h->ref_cache[list][scan8[0] - 1];
1000             int refb = h->ref_cache[list][scan8[0] - 8];
1001             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
                 /* -2 is presumably PART_NOT_AVAILABLE (TODO confirm): use the
                  * above-left neighbour instead of the above-right one */
1002             if(refc == -2)
1003                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
                 /* compared as unsigned, negative refs are larger than any valid
                  * index, so the minimum is the smallest non-negative ref if any */
1004             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1005             if(ref[list] < 0)
1006                 ref[list] = -1;
1007         }
1008
             /* no usable neighbour in either list: ref 0 and zero mv for both */
1009         if(ref[0] < 0 && ref[1] < 0){
1010             ref[0] = ref[1] = 0;
1011             mv[0][0] = mv[0][1] =
1012             mv[1][0] = mv[1][1] = 0;
1013         }else{
1014             for(list=0; list<2; list++){
1015                 if(ref[list] >= 0)
1016                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1017                 else
1018                     mv[list][0] = mv[list][1] = 0;
1019             }
1020         }
1021
             /* drop the list that has no reference from the (sub) mb type */
1022         if(ref[1] < 0){
1023             if(!is_b8x8)
1024                 *mb_type &= ~MB_TYPE_L1;
1025             sub_mb_type &= ~MB_TYPE_L1;
1026         }else if(ref[0] < 0){
1027             if(!is_b8x8)
1028                 *mb_type &= ~MB_TYPE_L0;
1029             sub_mb_type &= ~MB_TYPE_L0;
1030         }
1031
             /* current and co-located macroblock differ in frame/field coding:
              * force 8x8 partitions and re-address the co-located data */
1032         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1033             int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1034             int mb_types_col[2];
1035             int b8_stride = h->b8_stride;
1036             int b4_stride = h->b_stride;
1037
1038             *mb_type = (*mb_type & ~MB_TYPE_16x16) | MB_TYPE_8x8;
1039
1040             if(IS_INTERLACED(*mb_type)){
1041                 mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1042                 mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1043                 if(s->mb_y&1){
1044                     l1ref0 -= 2*b8_stride;
1045                     l1ref1 -= 2*b8_stride;
1046                     l1mv0 -= 4*b4_stride;
1047                     l1mv1 -= 4*b4_stride;
1048                 }
1049                 b8_stride *= 3;
1050                 b4_stride *= 6;
1051             }else{
1052                 int cur_poc = s->current_picture_ptr->poc;
1053                 int *col_poc = h->ref_list[1]->field_poc;
                     /* 1 if the second field POC is at least as close to the current POC */
1054                 int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1055                 int dy = 2*col_parity - (s->mb_y&1);
1056                 mb_types_col[0] =
1057                 mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy + col_parity*s->mb_stride];
1058                 l1ref0 += dy*b8_stride;
1059                 l1ref1 += dy*b8_stride;
1060                 l1mv0 += 2*dy*b4_stride;
1061                 l1mv1 += 2*dy*b4_stride;
1062                 b8_stride = 0;
1063             }
1064
1065             for(i8=0; i8<4; i8++){
1066                 int x8 = i8&1;
1067                 int y8 = i8>>1;
1068                 int xy8 = x8+y8*b8_stride;
1069                 int xy4 = 3*x8+y8*b4_stride;
1070                 int a=0, b=0;
1071
1072                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1073                     continue;
1074                 h->sub_mb_type[i8] = sub_mb_type;
1075
1076                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1077                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
                     /* co-located block is inter, uses ref 0 and has |mv| <= 1:
                      * the predicted mv is kept only for lists whose ref is > 0,
                      * the other list keeps the zero mv */
1078                 if(!IS_INTRA(mb_types_col[y8])
1079                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1080                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1081                     if(ref[0] > 0)
1082                         a= pack16to32(mv[0][0],mv[0][1]);
1083                     if(ref[1] > 0)
1084                         b= pack16to32(mv[1][0],mv[1][1]);
1085                 }else{
1086                     a= pack16to32(mv[0][0],mv[0][1]);
1087                     b= pack16to32(mv[1][0],mv[1][1]);
1088                 }
1089                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1090                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1091             }
1092         }else if(IS_16X16(*mb_type)){
1093             int a=0, b=0;
1094
1095             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1096             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
                 /* same test as above for the whole macroblock; the list 1
                  * alternative is only honoured when x264_build is 0 or > 33
                  * (presumably a workaround for old x264 streams; TODO confirm) */
1097             if(!IS_INTRA(mb_type_col)
1098                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1099                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1100                        && (h->x264_build>33 || !h->x264_build)))){
1101                 if(ref[0] > 0)
1102                     a= pack16to32(mv[0][0],mv[0][1]);
1103                 if(ref[1] > 0)
1104                     b= pack16to32(mv[1][0],mv[1][1]);
1105             }else{
1106                 a= pack16to32(mv[0][0],mv[0][1]);
1107                 b= pack16to32(mv[1][0],mv[1][1]);
1108             }
1109             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1110             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1111         }else{
1112             for(i8=0; i8<4; i8++){
1113                 const int x8 = i8&1;
1114                 const int y8 = i8>>1;
1115
1116                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1117                     continue;
1118                 h->sub_mb_type[i8] = sub_mb_type;
1119
                     /* start from the predicted mv/ref, then zero the mv where the
                      * co-located block qualifies (see col_zero_flag below) */
1120                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1121                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1122                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1123                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1124
1125                 /* col_zero_flag */
1126                 if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
1127                                               || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
1128                                                   && (h->x264_build>33 || !h->x264_build)))){
1129                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
1130                     if(IS_SUB_8X8(sub_mb_type)){
                             /* one co-located mv per 8x8 block, read at offset x8*3, y8*3 */
1131                         const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1132                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1133                             if(ref[0] == 0)
1134                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1135                             if(ref[1] == 0)
1136                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1137                         }
1138                     }else
1139                     for(i4=0; i4<4; i4++){
1140                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1141                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1142                             if(ref[0] == 0)
1143                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1144                             if(ref[1] == 0)
1145                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1146                         }
1147                     }
1148                 }
1149             }
1150         }
1151     }else{ /* direct temporal mv pred */
1152         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1153         const int *dist_scale_factor = h->dist_scale_factor;
1154
1155         if(FRAME_MBAFF){
                 /* field macroblocks use the per-field mapping and scale tables */
1156             if(IS_INTERLACED(*mb_type)){
1157                 map_col_to_list0[0] = h->map_col_to_list0_field[0];
1158                 map_col_to_list0[1] = h->map_col_to_list0_field[1];
1159                 dist_scale_factor = h->dist_scale_factor_field;
1160             }
1161             if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1162                 /* FIXME assumes direct_8x8_inference == 1 */
1163                 const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1164                 int mb_types_col[2];
1165                 int y_shift;
1166
1167                 *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
1168                          | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
1169                          | (*mb_type & MB_TYPE_INTERLACED);
1170                 sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;
1171
1172                 if(IS_INTERLACED(*mb_type)){
1173                     /* frame to field scaling */
1174                     mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1175                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1176                     if(s->mb_y&1){
1177                         l1ref0 -= 2*h->b8_stride;
1178                         l1ref1 -= 2*h->b8_stride;
1179                         l1mv0 -= 4*h->b_stride;
1180                         l1mv1 -= 4*h->b_stride;
1181                     }
                         /* y_shift 0: the vertical mv component is halved below */
1182                     y_shift = 0;
1183
1184                     if(   (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
1185                        && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
1186                        && !is_b8x8)
1187                         *mb_type |= MB_TYPE_16x8;
1188                     else
1189                         *mb_type |= MB_TYPE_8x8;
1190                 }else{
1191                     /* field to frame scaling */
1192                     /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
1193                      * but in MBAFF, top and bottom POC are equal */
1194                     int dy = (s->mb_y&1) ? 1 : 2;
1195                     mb_types_col[0] =
1196                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1197                     l1ref0 += dy*h->b8_stride;
1198                     l1ref1 += dy*h->b8_stride;
1199                     l1mv0 += 2*dy*h->b_stride;
1200                     l1mv1 += 2*dy*h->b_stride;
                         /* y_shift 2: the vertical mv component is doubled below */
1201                     y_shift = 2;
1202
1203                     if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
1204                        && !is_b8x8)
1205                         *mb_type |= MB_TYPE_16x16;
1206                     else
1207                         *mb_type |= MB_TYPE_8x8;
1208                 }
1209
1210                 for(i8=0; i8<4; i8++){
1211                     const int x8 = i8&1;
1212                     const int y8 = i8>>1;
1213                     int ref0, scale;
1214                     const int16_t (*l1mv)[2]= l1mv0;
1215
1216                     if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1217                         continue;
1218                     h->sub_mb_type[i8] = sub_mb_type;
1219
                         /* the list 1 reference is always index 0 */
1220                     fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                         /* intra co-located block: ref 0 and zero mv in both lists */
1221                     if(IS_INTRA(mb_types_col[y8])){
1222                         fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1223                         fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1224                         fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1225                         continue;
1226                     }
1227
                         /* take the co-located list 0 ref if it is valid, else its
                          * list 1 ref (and list 1 mv), mapped to our list 0 */
1228                     ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
1229                     if(ref0 >= 0)
1230                         ref0 = map_col_to_list0[0][ref0*2>>y_shift];
1231                     else{
1232                         ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
1233                         l1mv= l1mv1;
1234                     }
1235                     scale = dist_scale_factor[ref0];
1236                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1237
1238                     {
1239                         const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
1240                         int my_col = (mv_col[1]<<y_shift)/2;
                             /* L0 mv = scaled co-located mv, L1 mv = L0 mv - co-located mv */
1241                         int mx = (scale * mv_col[0] + 128) >> 8;
1242                         int my = (scale * my_col + 128) >> 8;
1243                         fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1244                         fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1245                     }
1246                 }
1247                 return;
1248             }
1249         }
1250
1251         /* one-to-one mv scaling */
1252
1253         if(IS_16X16(*mb_type)){
1254             int ref, mv0, mv1;
1255
1256             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1257             if(IS_INTRA(mb_type_col)){
1258                 ref=mv0=mv1=0;
1259             }else{
1260                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
1261                                                 : map_col_to_list0[1][l1ref1[0]];
1262                 const int scale = dist_scale_factor[ref0];
1263                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1264                 int mv_l0[2];
1265                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1266                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1267                 ref= ref0;
1268                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1269                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1270             }
1271             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1272             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1273             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1274         }else{
1275             for(i8=0; i8<4; i8++){
1276                 const int x8 = i8&1;
1277                 const int y8 = i8>>1;
1278                 int ref0, scale;
1279                 const int16_t (*l1mv)[2]= l1mv0;
1280
1281                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1282                     continue;
1283                 h->sub_mb_type[i8] = sub_mb_type;
1284                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1285                 if(IS_INTRA(mb_type_col)){
1286                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1287                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1288                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1289                     continue;
1290                 }
1291
1292                 ref0 = l1ref0[x8 + y8*h->b8_stride];
1293                 if(ref0 >= 0)
1294                     ref0 = map_col_to_list0[0][ref0];
1295                 else{
1296                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
1297                     l1mv= l1mv1;
1298                 }
1299                 scale = dist_scale_factor[ref0];
1300
1301                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1302                 if(IS_SUB_8X8(sub_mb_type)){
1303                     const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1304                     int mx = (scale * mv_col[0] + 128) >> 8;
1305                     int my = (scale * mv_col[1] + 128) >> 8;
1306                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1307                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1308                 }else
1309                 for(i4=0; i4<4; i4++){
1310                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1311                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1312                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1313                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1314                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1315                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1316                 }
1317             }
1318         }
1319     }
1320 }
1321
1322 static inline void write_back_motion(H264Context *h, int mb_type){
1323     MpegEncContext * const s = &h->s;
1324     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1325     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1326     int list;
1327
1328     if(!USES_LIST(mb_type, 0))
1329         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1330
1331     for(list=0; list<h->list_count; list++){
1332         int y;
1333         if(!USES_LIST(mb_type, list))
1334             continue;
1335
1336         for(y=0; y<4; y++){
1337             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1338             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1339         }
1340         if( h->pps.cabac ) {
1341             if(IS_SKIP(mb_type))
1342                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1343             else
1344             for(y=0; y<4; y++){
1345                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1346                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1347             }
1348         }
1349
1350         {
1351             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1352             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1353             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1354             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1355             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1356         }
1357     }
1358
1359     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1360         if(IS_8X8(mb_type)){
1361             uint8_t *direct_table = &h->direct_table[b8_xy];
1362             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1363             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1364             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1365         }
1366     }
1367 }
1368
1369 /**
1370  * Decodes a network abstraction layer unit.
1371  * @param consumed is the number of bytes used as input
1372  * @param length is the length of the array
1373  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1374  * @returns decoded bytes, might be src+1 if no escapes
1375  */
1376 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1377     int i, si, di;
1378     uint8_t *dst;
1379     int bufidx;
1380
1381 //    src[0]&0x80;                //forbidden bit
1382     h->nal_ref_idc= src[0]>>5;
1383     h->nal_unit_type= src[0]&0x1F;
1384
1385     src++; length--;
1386 #if 0
1387     for(i=0; i<length; i++)
1388         printf("%2X ", src[i]);
1389 #endif
1390     for(i=0; i+1<length; i+=2){
1391         if(src[i]) continue;
1392         if(i>0 && src[i-1]==0) i--;
1393         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1394             if(src[i+2]!=3){
1395                 /* startcode, so we must be past the end */
1396                 length=i;
1397             }
1398             break;
1399         }
1400     }
1401
1402     if(i>=length-1){ //no escaped 0
1403         *dst_length= length;
1404         *consumed= length+1; //+1 for the header
1405         return src;
1406     }
1407
1408     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1409     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
1410     dst= h->rbsp_buffer[bufidx];
1411
1412     if (dst == NULL){
1413         return NULL;
1414     }
1415
1416 //printf("decoding esc\n");
1417     si=di=0;
1418     while(si<length){
1419         //remove escapes (very rare 1:2^22)
1420         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1421             if(src[si+2]==3){ //escape
1422                 dst[di++]= 0;
1423                 dst[di++]= 0;
1424                 si+=3;
1425                 continue;
1426             }else //next start code
1427                 break;
1428         }
1429
1430         dst[di++]= src[si++];
1431     }
1432
1433     *dst_length= di;
1434     *consumed= si + 1;//+1 for the header
1435 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1436     return dst;
1437 }
1438
1439 /**
1440  * identifies the exact end of the bitstream
1441  * @return the length of the trailing, or 0 if damaged
1442  */
1443 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1444     int v= *src;
1445     int r;
1446
1447     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1448
1449     for(r=1; r<9; r++){
1450         if(v&1) return r;
1451         v>>=1;
1452     }
1453     return 0;
1454 }
1455
1456 /**
1457  * idct tranforms the 16 dc values and dequantize them.
1458  * @param qp quantization parameter
1459  */
1460 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1461 #define stride 16
1462     int i;
1463     int temp[16]; //FIXME check if this is a good idea
1464     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1465     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1466
1467 //memset(block, 64, 2*256);
1468 //return;
1469     for(i=0; i<4; i++){
1470         const int offset= y_offset[i];
1471         const int z0= block[offset+stride*0] + block[offset+stride*4];
1472         const int z1= block[offset+stride*0] - block[offset+stride*4];
1473         const int z2= block[offset+stride*1] - block[offset+stride*5];
1474         const int z3= block[offset+stride*1] + block[offset+stride*5];
1475
1476         temp[4*i+0]= z0+z3;
1477         temp[4*i+1]= z1+z2;
1478         temp[4*i+2]= z1-z2;
1479         temp[4*i+3]= z0-z3;
1480     }
1481
1482     for(i=0; i<4; i++){
1483         const int offset= x_offset[i];
1484         const int z0= temp[4*0+i] + temp[4*2+i];
1485         const int z1= temp[4*0+i] - temp[4*2+i];
1486         const int z2= temp[4*1+i] - temp[4*3+i];
1487         const int z3= temp[4*1+i] + temp[4*3+i];
1488
1489         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_resdual
1490         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1491         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1492         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1493     }
1494 }
1495
#if 0
/* NOTE: this function is compiled out by the surrounding #if 0. */
/**
 * DCT transforms the 16 dc values.
 * The transform is done in place; every output value is halved (>>1).
 * Uses the "stride" macro, which must still be defined when this is enabled.
 * @param qp quantization parameter ??? FIXME
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    int i;
    int temp[16]; //FIXME check if this is a good idea
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};

    /* first pass: butterflies over each group of 4 values, into temp */
    for(i=0; i<4; i++){
        const int offset= y_offset[i];
        const int z0= block[offset+stride*0] + block[offset+stride*4];
        const int z1= block[offset+stride*0] - block[offset+stride*4];
        const int z2= block[offset+stride*1] - block[offset+stride*5];
        const int z3= block[offset+stride*1] + block[offset+stride*5];

        temp[4*i+0]= z0+z3;
        temp[4*i+1]= z1+z2;
        temp[4*i+2]= z1-z2;
        temp[4*i+3]= z0-z3;
    }

    /* second pass: butterflies across temp, written back to block */
    for(i=0; i<4; i++){
        const int offset= x_offset[i];
        const int z0= temp[4*0+i] + temp[4*2+i];
        const int z1= temp[4*0+i] - temp[4*2+i];
        const int z2= temp[4*1+i] - temp[4*3+i];
        const int z3= temp[4*1+i] + temp[4*3+i];

        block[stride*0 +offset]= (z0 + z3)>>1;
        block[stride*2 +offset]= (z1 + z2)>>1;
        block[stride*8 +offset]= (z1 - z2)>>1;
        block[stride*10+offset]= (z0 - z3)>>1;
    }
}
#endif
1535
1536 #undef xStride
1537 #undef stride
1538
1539 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1540     const int stride= 16*2;
1541     const int xStride= 16;
1542     int a,b,c,d,e;
1543
1544     a= block[stride*0 + xStride*0];
1545     b= block[stride*0 + xStride*1];
1546     c= block[stride*1 + xStride*0];
1547     d= block[stride*1 + xStride*1];
1548
1549     e= a-b;
1550     a= a+b;
1551     b= c-d;
1552     c= c+d;
1553
1554     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1555     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1556     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1557     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1558 }
1559
#if 0
/* NOTE: this function is compiled out by the surrounding #if 0. */
/**
 * Transforms the 2x2 chroma dc values in place, without any scaling.
 * The four values are at block[0], block[16], block[32] and block[48].
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int stride= 16*2;
    const int xStride= 16;
    int a,b,c,d,e;

    a= block[stride*0 + xStride*0];
    b= block[stride*0 + xStride*1];
    c= block[stride*1 + xStride*0];
    d= block[stride*1 + xStride*1];

    /* horizontal butterflies: e/a = difference/sum of the top pair,
     * b/c = difference/sum of the bottom pair */
    e= a-b;
    a= a+b;
    b= c-d;
    c= c+d;

    block[stride*0 + xStride*0]= (a+c);
    block[stride*0 + xStride*1]= (e+b);
    block[stride*1 + xStride*0]= (a-c);
    block[stride*1 + xStride*1]= (e-b);
}
#endif
1582
1583 /**
1584  * gets the chroma qp.
1585  */
1586 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1587     return h->pps.chroma_qp_table[t][qscale & 0xff];
1588 }
1589
1590 //FIXME need to check that this does not overflow signed 32 bit for low qp, I am not sure, it's very close
1591 //FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
1592 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
1593     int i;
1594     const int * const quant_table= quant_coeff[qscale];
1595     const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1596     const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1597     const unsigned int threshold2= (threshold1<<1);
1598     int last_non_zero;
1599
1600     if(separate_dc){
1601         if(qscale<=18){
1602             //avoid overflows
1603             const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1604             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1605             const unsigned int dc_threshold2= (dc_threshold1<<1);
1606
1607             int level= block[0]*quant_coeff[qscale+18][0];
1608             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1609                 if(level>0){
1610                     level= (dc_bias + level)>>(QUANT_SHIFT-2);
1611                     block[0]= level;
1612                 }else{
1613                     level= (dc_bias - level)>>(QUANT_SHIFT-2);
1614                     block[0]= -level;
1615                 }
1616 //                last_non_zero = i;
1617             }else{
1618                 block[0]=0;
1619             }
1620         }else{
1621             const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1622             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1623             const unsigned int dc_threshold2= (dc_threshold1<<1);
1624
1625             int level= block[0]*quant_table[0];
1626             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1627                 if(level>0){
1628                     level= (dc_bias + level)>>(QUANT_SHIFT+1);
1629                     block[0]= level;
1630                 }else{
1631                     level= (dc_bias - level)>>(QUANT_SHIFT+1);
1632                     block[0]= -level;
1633                 }
1634 //                last_non_zero = i;
1635             }else{
1636                 block[0]=0;
1637             }
1638         }
1639         last_non_zero= 0;
1640         i=1;
1641     }else{
1642         last_non_zero= -1;
1643         i=0;
1644     }
1645
1646     for(; i<16; i++){
1647         const int j= scantable[i];
1648         int level= block[j]*quant_table[j];
1649
1650 //        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
1651 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
1652         if(((unsigned)(level+threshold1))>threshold2){
1653             if(level>0){
1654                 level= (bias + level)>>QUANT_SHIFT;
1655                 block[j]= level;
1656             }else{
1657                 level= (bias - level)>>QUANT_SHIFT;
1658                 block[j]= -level;
1659             }
1660             last_non_zero = i;
1661         }else{
1662             block[j]=0;
1663         }
1664     }
1665
1666     return last_non_zero;
1667 }
1668
/**
 * Motion compensates one partition from a single reference picture.
 *
 * The motion vector is taken from h->mv_cache[list][scan8[n]] and combined
 * with the source offsets. Luma is predicted with qpix_op (selected by the
 * low 2 bits of mx/my), chroma with chroma_op (using the low 3 bits as
 * fractional position). If the referenced area leaves the picture by more
 * than the available edge, the source is first copied into
 * s->edge_emu_buffer by ff_emulated_edge_mc().
 *
 * @param pic reference picture; nothing is done if pic->data[0] is NULL
 * @param n index (through scan8) of the block whose motion vector is used
 * @param square if zero, qpix_op is applied a second time at offset delta
 * @param chroma_height height passed to chroma_op
 * @param delta offset of the second luma call (source and destination)
 * @param src_x_offset, src_y_offset position of the partition, scaled by 8
 *        before being added to the motion vector
 */
static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int src_x_offset, int src_y_offset,
                           qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
    MpegEncContext * const s = &h->s;
    const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
    int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
    const int luma_xy= (mx&3) + ((my&3)<<2); // index of the luma interpolation function
    uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
    uint8_t * src_cb, * src_cr;
    int extra_width= h->emu_edge_width;
    int extra_height= h->emu_edge_height;
    int emu=0; // set once edge emulation was needed
    const int full_mx= mx>>2;
    const int full_my= my>>2;
    const int pic_width  = 16*s->mb_width;
    const int pic_height = 16*s->mb_height >> MB_FIELD;

    if(!pic->data[0]) //FIXME this is unacceptable, some senseable error concealment must be done for missing reference frames
        return;

    // shrink the usable margin when the position has a fractional part
    if(mx&7) extra_width -= 3;
    if(my&7) extra_height -= 3;

    if(   full_mx < 0-extra_width
       || full_my < 0-extra_height
       || full_mx + 16/*FIXME*/ > pic_width + extra_width
       || full_my + 16/*FIXME*/ > pic_height + extra_height){
        // the source area starts 2 samples up/left of the block and is (16+5)x(16+5)
        ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
            src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
        emu=1;
    }

    qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
    if(!square){
        qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
    }

    // luma only when gray decoding is enabled and requested
    if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;

    if(MB_FIELD){
        // chroma offset when predicting from a field of opposite parity
        my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
        emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
    }
    src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
    src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;

    // the emulation buffer is reused: cb must be consumed before cr is prepared
    if(emu){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
            src_cb= s->edge_emu_buffer;
    }
    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);

    if(emu){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
            src_cr= s->edge_emu_buffer;
    }
    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
}
1729
1730 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1731                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1732                            int x_offset, int y_offset,
1733                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1734                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1735                            int list0, int list1){
1736     MpegEncContext * const s = &h->s;
1737     qpel_mc_func *qpix_op=  qpix_put;
1738     h264_chroma_mc_func chroma_op= chroma_put;
1739
1740     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1741     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1742     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1743     x_offset += 8*s->mb_x;
1744     y_offset += 8*(s->mb_y >> MB_FIELD);
1745
1746     if(list0){
1747         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1748         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1749                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1750                            qpix_op, chroma_op);
1751
1752         qpix_op=  qpix_avg;
1753         chroma_op= chroma_avg;
1754     }
1755
1756     if(list1){
1757         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1758         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1759                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1760                            qpix_op, chroma_op);
1761     }
1762 }
1763
/**
 * Weighted motion compensation of one partition.
 *
 * With both lists used, list 0 is predicted into the destination and list 1
 * into s->obmc_scratchpad, then both are combined by the biweight functions,
 * either with implicit weights (h->use_weight == 2) or with the weights and
 * offsets stored in h. With a single list the prediction is written to the
 * destination and weighted in place; chroma is only weighted if
 * h->use_weight_chroma is set.
 *
 * @param x_offset, y_offset position of the partition inside the macroblock
 *        (added once to the chroma destinations and doubled for luma)
 * @param list0, list1 non-zero if the respective list is used
 */
static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int x_offset, int y_offset,
                           qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
                           h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
                           h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
                           int list0, int list1){
    MpegEncContext * const s = &h->s;

    dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
    dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
    dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
    // from here on the offsets are relative to the picture
    x_offset += 8*s->mb_x;
    y_offset += 8*(s->mb_y >> MB_FIELD);

    if(list0 && list1){
        /* don't optimize for luma-only case, since B-frames usually
         * use implicit weights => chroma too. */
        // temporary planes for the list 1 prediction, all inside the scratchpad
        uint8_t *tmp_cb = s->obmc_scratchpad;
        uint8_t *tmp_cr = s->obmc_scratchpad + 8;
        uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
        int refn0 = h->ref_cache[0][ scan8[n] ];
        int refn1 = h->ref_cache[1][ scan8[n] ];

        mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
                    dest_y, dest_cb, dest_cr,
                    x_offset, y_offset, qpix_put, chroma_put);
        mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
                    tmp_y, tmp_cb, tmp_cr,
                    x_offset, y_offset, qpix_put, chroma_put);

        if(h->use_weight == 2){
            // implicit weights: the two weights sum to 64, log2 denominator 5, no offset
            int weight0 = h->implicit_weight[refn0][refn1];
            int weight1 = 64 - weight0;
            luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
        }else{
            // weights per reference from h; the offsets of both lists are summed
            luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
                            h->luma_weight[0][refn0], h->luma_weight[1][refn1],
                            h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                            h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
                            h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                            h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
                            h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
        }
    }else{
        // single list: list 1 if it is used, list 0 otherwise
        int list = list1 ? 1 : 0;
        int refn = h->ref_cache[list][ scan8[n] ];
        Picture *ref= &h->ref_list[list][refn];
        mc_dir_part(h, ref, n, square, chroma_height, delta, list,
                    dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put, chroma_put);

        luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
                       h->luma_weight[list][refn], h->luma_offset[list][refn]);
        if(h->use_weight_chroma){
            chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                             h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
            chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                             h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
        }
    }
}
1830
1831 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1832                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1833                            int x_offset, int y_offset,
1834                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1835                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1836                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1837                            int list0, int list1){
1838     if((h->use_weight==2 && list0 && list1
1839         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1840        || h->use_weight==1)
1841         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1842                          x_offset, y_offset, qpix_put, chroma_put,
1843                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1844     else
1845         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1846                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1847 }
1848
1849 static inline void prefetch_motion(H264Context *h, int list){
1850     /* fetch pixels for estimated mv 4 macroblocks ahead
1851      * optimized for 64byte cache lines */
1852     MpegEncContext * const s = &h->s;
1853     const int refn = h->ref_cache[list][scan8[0]];
1854     if(refn >= 0){
1855         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1856         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1857         uint8_t **src= h->ref_list[list][refn].data;
1858         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1859         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1860         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1861         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1862     }
1863 }
1864
/**
 * Performs the inter prediction of the current macroblock.
 *
 * The macroblock type read from s->current_picture.mb_type[] selects how the
 * macroblock is split; mc_part() is called once per partition (or
 * sub-partition for 8x8 macroblocks) with the function tables matching the
 * partition size. Reference data is prefetched for list 0 before and for
 * list 1 after the prediction.
 *
 * @param dest_y, dest_cb, dest_cr destination of the macroblock
 * @param qpix_put, qpix_avg luma function tables, indexed by block size
 * @param chroma_put, chroma_avg chroma function tables, indexed by block size
 * @param weight_op, weight_avg weighting function tables; the entry for the
 *        partition shape is passed on to mc_part()
 */
static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                      qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
                      qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
                      h264_weight_func *weight_op, h264_biweight_func *weight_avg){
    MpegEncContext * const s = &h->s;
    const int mb_xy= h->mb_xy;
    const int mb_type= s->current_picture.mb_type[mb_xy];

    assert(IS_INTER(mb_type));

    prefetch_motion(h, 0);

    if(IS_16X16(mb_type)){
        // one partition covering the whole macroblock
        mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
                &weight_op[0], &weight_avg[0],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
    }else if(IS_16X8(mb_type)){
        // two partitions, the second one at y_offset 4 using block 8
        mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
        mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
    }else if(IS_8X16(mb_type)){
        // two partitions, the second one at x_offset 4 using block 4
        mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                &weight_op[2], &weight_avg[2],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
        mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                &weight_op[2], &weight_avg[2],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
    }else{
        int i;

        assert(IS_8X8(mb_type));

        // four 8x8 blocks, each split according to its own sub_mb_type
        for(i=0; i<4; i++){
            const int sub_mb_type= h->sub_mb_type[i];
            const int n= 4*i;
            int x_offset= (i&1)<<2;
            int y_offset= (i&2)<<1;

            if(IS_SUB_8X8(sub_mb_type)){
                mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                    &weight_op[3], &weight_avg[3],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else if(IS_SUB_8X4(sub_mb_type)){
                mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else if(IS_SUB_4X8(sub_mb_type)){
                mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                    &weight_op[5], &weight_avg[5],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                    &weight_op[5], &weight_avg[5],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else{
                int j;
                assert(IS_SUB_4X4(sub_mb_type));
                // four 4x4 blocks, each with its own motion vector (block n+j)
                for(j=0; j<4; j++){
                    int sub_x_offset= x_offset + 2*(j&1);
                    int sub_y_offset= y_offset +   (j&2);
                    mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
                        qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                        &weight_op[6], &weight_avg[6],
                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                }
            }
        }
    }

    prefetch_motion(h, 1);
}
1951
1952 static av_cold void decode_init_vlc(void){
1953     static int done = 0;
1954
1955     if (!done) {
1956         int i;
1957         done = 1;
1958
1959         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1960                  &chroma_dc_coeff_token_len [0], 1, 1,
1961                  &chroma_dc_coeff_token_bits[0], 1, 1, 1);
1962
1963         for(i=0; i<4; i++){
1964             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1965                      &coeff_token_len [i][0], 1, 1,
1966                      &coeff_token_bits[i][0], 1, 1, 1);
1967         }
1968
1969         for(i=0; i<3; i++){
1970             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1971                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1972                      &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
1973         }
1974         for(i=0; i<15; i++){
1975             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
1976                      &total_zeros_len [i][0], 1, 1,
1977                      &total_zeros_bits[i][0], 1, 1, 1);
1978         }
1979
1980         for(i=0; i<6; i++){
1981             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
1982                      &run_len [i][0], 1, 1,
1983                      &run_bits[i][0], 1, 1, 1);
1984         }
1985         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1986                  &run_len [6][0], 1, 1,
1987                  &run_bits[6][0], 1, 1, 1);
1988     }
1989 }
1990
1991 static void free_tables(H264Context *h){
1992     int i;
1993     H264Context *hx;
1994     av_freep(&h->intra4x4_pred_mode);
1995     av_freep(&h->chroma_pred_mode_table);
1996     av_freep(&h->cbp_table);
1997     av_freep(&h->mvd_table[0]);
1998     av_freep(&h->mvd_table[1]);
1999     av_freep(&h->direct_table);
2000     av_freep(&h->non_zero_count);
2001     av_freep(&h->slice_table_base);
2002     h->slice_table= NULL;
2003
2004     av_freep(&h->mb2b_xy);
2005     av_freep(&h->mb2b8_xy);
2006
2007     for(i = 0; i < MAX_SPS_COUNT; i++)
2008         av_freep(h->sps_buffers + i);
2009
2010     for(i = 0; i < MAX_PPS_COUNT; i++)
2011         av_freep(h->pps_buffers + i);
2012
2013     for(i = 0; i < h->s.avctx->thread_count; i++) {
2014         hx = h->thread_context[i];
2015         if(!hx) continue;
2016         av_freep(&hx->top_borders[1]);
2017         av_freep(&hx->top_borders[0]);
2018         av_freep(&hx->s.obmc_scratchpad);
2019     }
2020 }
2021
2022 static void init_dequant8_coeff_table(H264Context *h){
2023     int i,q,x;
2024     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2025     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2026     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2027
2028     for(i=0; i<2; i++ ){
2029         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2030             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2031             break;
2032         }
2033
2034         for(q=0; q<52; q++){
2035             int shift = ff_div6[q];
2036             int idx = ff_rem6[q];
2037             for(x=0; x<64; x++)
2038                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2039                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2040                     h->pps.scaling_matrix8[i][x]) << shift;
2041         }
2042     }
2043 }
2044
2045 static void init_dequant4_coeff_table(H264Context *h){
2046     int i,j,q,x;
2047     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2048     for(i=0; i<6; i++ ){
2049         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2050         for(j=0; j<i; j++){
2051             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2052                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2053                 break;
2054             }
2055         }
2056         if(j<i)
2057             continue;
2058
2059         for(q=0; q<52; q++){
2060             int shift = ff_div6[q] + 2;
2061             int idx = ff_rem6[q];
2062             for(x=0; x<16; x++)
2063                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2064                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2065                     h->pps.scaling_matrix4[i][x]) << shift;
2066         }
2067     }
2068 }
2069
2070 static void init_dequant_tables(H264Context *h){
2071     int i,x;
2072     init_dequant4_coeff_table(h);
2073     if(h->pps.transform_8x8_mode)
2074         init_dequant8_coeff_table(h);
2075     if(h->sps.transform_bypass){
2076         for(i=0; i<6; i++)
2077             for(x=0; x<16; x++)
2078                 h->dequant4_coeff[i][0][x] = 1<<6;
2079         if(h->pps.transform_8x8_mode)
2080             for(i=0; i<2; i++)
2081                 for(x=0; x<64; x++)
2082                     h->dequant8_coeff[i][0][x] = 1<<6;
2083     }
2084 }
2085
2086
2087 /**
2088  * allocates tables.
2089  * needs width/height
2090  */
2091 static int alloc_tables(H264Context *h){
2092     MpegEncContext * const s = &h->s;
2093     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2094     int x,y;
2095
2096     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2097
2098     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2099     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
2100     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2101
2102     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2103     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2104     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2105     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2106
2107     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(uint8_t));
2108     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2109
2110     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2111     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2112     for(y=0; y<s->mb_height; y++){
2113         for(x=0; x<s->mb_width; x++){
2114             const int mb_xy= x + y*s->mb_stride;
2115             const int b_xy = 4*x + 4*y*h->b_stride;
2116             const int b8_xy= 2*x + 2*y*h->b8_stride;
2117
2118             h->mb2b_xy [mb_xy]= b_xy;
2119             h->mb2b8_xy[mb_xy]= b8_xy;
2120         }
2121     }
2122
2123     s->obmc_scratchpad = NULL;
2124
2125     if(!h->dequant4_coeff[0])
2126         init_dequant_tables(h);
2127
2128     return 0;
2129 fail:
2130     free_tables(h);
2131     return -1;
2132 }
2133
2134 /**
2135  * Mimic alloc_tables(), but for every context thread.
2136  */
2137 static void clone_tables(H264Context *dst, H264Context *src){
2138     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2139     dst->non_zero_count           = src->non_zero_count;
2140     dst->slice_table              = src->slice_table;
2141     dst->cbp_table                = src->cbp_table;
2142     dst->mb2b_xy                  = src->mb2b_xy;
2143     dst->mb2b8_xy                 = src->mb2b8_xy;
2144     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2145     dst->mvd_table[0]             = src->mvd_table[0];
2146     dst->mvd_table[1]             = src->mvd_table[1];
2147     dst->direct_table             = src->direct_table;
2148
2149     dst->s.obmc_scratchpad = NULL;
2150     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2151 }
2152
2153 /**
2154  * Init context
2155  * Allocate buffers which are not shared amongst multiple threads.
2156  */
2157 static int context_init(H264Context *h){
2158     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2159     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2160
2161     return 0;
2162 fail:
2163     return -1; // free_tables will clean up for us
2164 }
2165
2166 static av_cold void common_init(H264Context *h){
2167     MpegEncContext * const s = &h->s;
2168
2169     s->width = s->avctx->width;
2170     s->height = s->avctx->height;
2171     s->codec_id= s->avctx->codec->id;
2172
2173     ff_h264_pred_init(&h->hpc, s->codec_id);
2174
2175     h->dequant_coeff_pps= -1;
2176     s->unrestricted_mv=1;
2177     s->decode=1; //FIXME
2178
2179     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2180     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2181 }
2182
2183 static av_cold int decode_init(AVCodecContext *avctx){
2184     H264Context *h= avctx->priv_data;
2185     MpegEncContext * const s = &h->s;
2186
2187     MPV_decode_defaults(s);
2188
2189     s->avctx = avctx;
2190     common_init(h);
2191
2192     s->out_format = FMT_H264;
2193     s->workaround_bugs= avctx->workaround_bugs;
2194
2195     // set defaults
2196 //    s->decode_mb= ff_h263_decode_mb;
2197     s->quarter_sample = 1;
2198     s->low_delay= 1;
2199
2200     if(avctx->codec_id == CODEC_ID_SVQ3)
2201         avctx->pix_fmt= PIX_FMT_YUVJ420P;
2202     else
2203         avctx->pix_fmt= PIX_FMT_YUV420P;
2204
2205     decode_init_vlc();
2206
2207     if(avctx->extradata_size > 0 && avctx->extradata &&
2208        *(char *)avctx->extradata == 1){
2209         h->is_avc = 1;
2210         h->got_avcC = 0;
2211     } else {
2212         h->is_avc = 0;
2213     }
2214
2215     h->thread_context[0] = h;
2216     return 0;
2217 }
2218
2219 static int frame_start(H264Context *h){
2220     MpegEncContext * const s = &h->s;
2221     int i;
2222
2223     if(MPV_frame_start(s, s->avctx) < 0)
2224         return -1;
2225     ff_er_frame_start(s);
2226     /*
2227      * MPV_frame_start uses pict_type to derive key_frame.
2228      * This is incorrect for H.264; IDR markings must be used.
2229      * Zero here; IDR markings per slice in frame or fields are OR'd in later.
2230      * See decode_nal_units().
2231      */
2232     s->current_picture_ptr->key_frame= 0;
2233
2234     assert(s->linesize && s->uvlinesize);
2235
2236     for(i=0; i<16; i++){
2237         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2238         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2239     }
2240     for(i=0; i<4; i++){
2241         h->block_offset[16+i]=
2242         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2243         h->block_offset[24+16+i]=
2244         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2245     }
2246
2247     /* can't be in alloc_tables because linesize isn't known there.
2248      * FIXME: redo bipred weight to not require extra buffer? */
2249     for(i = 0; i < s->avctx->thread_count; i++)
2250         if(!h->thread_context[i]->s.obmc_scratchpad)
2251             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2252
2253     /* some macroblocks will be accessed before they're available */
2254     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2255         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
2256
2257 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2258
2259     // We mark the current picture as non reference after allocating it, so
2260     // that if we break out due to an error it can be released automatically
2261     // in the next MPV_frame_start().
2262     // SVQ3 as well as most other codecs have only last/next/current and thus
2263     // get released even with set reference, besides SVQ3 and others do not
2264     // mark frames as reference later "naturally".
2265     if(s->codec_id != CODEC_ID_SVQ3)
2266         s->current_picture_ptr->reference= 0;
2267     return 0;
2268 }
2269
2270 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2271     MpegEncContext * const s = &h->s;
2272     int i;
2273
2274     src_y  -=   linesize;
2275     src_cb -= uvlinesize;
2276     src_cr -= uvlinesize;
2277
2278     // There are two lines saved, the line above the the top macroblock of a pair,
2279     // and the line above the bottom macroblock
2280     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2281     for(i=1; i<17; i++){
2282         h->left_border[i]= src_y[15+i*  linesize];
2283     }
2284
2285     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2286     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2287
2288     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2289         h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
2290         h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
2291         for(i=1; i<9; i++){
2292             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
2293             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
2294         }
2295         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2296         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2297     }
2298 }
2299
2300 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2301     MpegEncContext * const s = &h->s;
2302     int temp8, i;
2303     uint64_t temp64;
2304     int deblock_left;
2305     int deblock_top;
2306     int mb_xy;
2307
2308     if(h->deblocking_filter == 2) {
2309         mb_xy = h->mb_xy;
2310         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2311         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2312     } else {
2313         deblock_left = (s->mb_x > 0);
2314         deblock_top =  (s->mb_y > 0);
2315     }
2316
2317     src_y  -=   linesize + 1;
2318     src_cb -= uvlinesize + 1;
2319     src_cr -= uvlinesize + 1;
2320
2321 #define XCHG(a,b,t,xchg)\
2322 t= a;\
2323 if(xchg)\
2324     a= b;\
2325 b= t;
2326
2327     if(deblock_left){
2328         for(i = !deblock_top; i<17; i++){
2329             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2330         }
2331     }
2332
2333     if(deblock_top){
2334         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2335         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2336         if(s->mb_x+1 < s->mb_width){
2337             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2338         }
2339     }
2340
2341     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2342         if(deblock_left){
2343             for(i = !deblock_top; i<9; i++){
2344                 XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
2345                 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
2346             }
2347         }
2348         if(deblock_top){
2349             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2350             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2351         }
2352     }
2353 }
2354
2355 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
2356     MpegEncContext * const s = &h->s;
2357     int i;
2358
2359     src_y  -= 2 *   linesize;
2360     src_cb -= 2 * uvlinesize;
2361     src_cr -= 2 * uvlinesize;
2362
2363     // There are two lines saved, the line above the the top macroblock of a pair,
2364     // and the line above the bottom macroblock
2365     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2366     h->left_border[1]= h->top_borders[1][s->mb_x][15];
2367     for(i=2; i<34; i++){
2368         h->left_border[i]= src_y[15+i*  linesize];
2369     }
2370
2371     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
2372     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
2373     *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
2374     *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
2375
2376     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2377         h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
2378         h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
2379         h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
2380         h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
2381         for(i=2; i<18; i++){
2382             h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
2383             h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
2384         }
2385         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
2386         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
2387         *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
2388         *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
2389     }
2390 }
2391
2392 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
2393     MpegEncContext * const s = &h->s;
2394     int temp8, i;
2395     uint64_t temp64;
2396     int deblock_left = (s->mb_x > 0);
2397     int deblock_top  = (s->mb_y > 1);
2398
2399     tprintf(s->avctx, "xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
2400
2401     src_y  -= 2 *   linesize + 1;
2402     src_cb -= 2 * uvlinesize + 1;
2403     src_cr -= 2 * uvlinesize + 1;
2404
2405 #define XCHG(a,b,t,xchg)\
2406 t= a;\
2407 if(xchg)\
2408     a= b;\
2409 b= t;
2410
2411     if(deblock_left){
2412         for(i = (!deblock_top)<<1; i<34; i++){
2413             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2414         }
2415     }
2416
2417     if(deblock_top){
2418         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2419         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2420         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
2421         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
2422         if(s->mb_x+1 < s->mb_width){
2423             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2424             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
2425         }
2426     }
2427
2428     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2429         if(deblock_left){
2430             for(i = (!deblock_top) << 1; i<18; i++){
2431                 XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
2432                 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
2433             }
2434         }
2435         if(deblock_top){
2436             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2437             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2438             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
2439             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
2440         }
2441     }
2442 }
2443
/**
 * Reconstructs the current macroblock into the current picture and, if the
 * deblocking filter is enabled, runs the loop filter on it.
 *
 * Order of operations: compute the destination pointers; set up line sizes
 * and block offsets (doubled for field macroblocks); select the residual
 * add functions; write I_PCM samples, intra prediction or inter prediction
 * (hl_motion()); add the luma and chroma residuals; deblock.
 *
 * @param h      decoder context; the macroblock position is taken from
 *               s->mb_x, s->mb_y and h->mb_xy
 * @param simple nonzero selects the reduced path: the field/MBAFF, I_PCM,
 *               CODEC_FLAG_GRAY, encoder and non-H.264 branches are not
 *               taken. Callers in this file pass a constant.
 */
static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
    MpegEncContext * const s = &h->s;
    const int mb_x= s->mb_x;
    const int mb_y= s->mb_y;
    const int mb_xy= h->mb_xy;
    const int mb_type= s->current_picture.mb_type[mb_xy];
    uint8_t  *dest_y, *dest_cb, *dest_cr;
    int linesize, uvlinesize /*dct_offset*/;
    int i;
    int *block_offset = &h->block_offset[0];
    const unsigned int bottom = mb_y & 1;
    const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
    void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);

    // top-left sample of this macroblock in each plane (16x16 luma, 8x8 chroma)
    dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
    dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
    dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;

    s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
    s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);

    if (!simple && MB_FIELD) {
        // field macroblock: doubled line sizes and the second set of block offsets
        linesize   = h->mb_linesize   = s->linesize * 2;
        uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
        block_offset = &h->block_offset[24];
        if(mb_y&1){ //FIXME move out of this func?
            // odd mb_y: move the destination up by 15 luma / 7 chroma lines
            dest_y -= s->linesize*15;
            dest_cb-= s->uvlinesize*7;
            dest_cr-= s->uvlinesize*7;
        }
        if(FRAME_MBAFF) {
            int list;
            // rewrite the cached reference indices as (16+ref)^(mb_y&1)
            for(list=0; list<h->list_count; list++){
                if(!USES_LIST(mb_type, list))
                    continue;
                if(IS_16X16(mb_type)){
                    int8_t *ref = &h->ref_cache[list][scan8[0]];
                    fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
                }else{
                    for(i=0; i<16; i+=4){
                        //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
                        int ref = h->ref_cache[list][scan8[i]];
                        if(ref >= 0)
                            fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
                    }
                }
            }
        }
    } else {
        linesize   = h->mb_linesize   = s->linesize;
        uvlinesize = h->mb_uvlinesize = s->uvlinesize;
//        dct_offset = s->linesize * 16;
    }

    // choose the functions that add the luma residual: plain pixel add for
    // transform bypass, otherwise the 8x8 or 4x4 IDCT
    if(transform_bypass){
        idct_dc_add =
        idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
    }else if(IS_8x8DCT(mb_type)){
        idct_dc_add = s->dsp.h264_idct8_dc_add;
        idct_add = s->dsp.h264_idct8_add;
    }else{
        idct_dc_add = s->dsp.h264_idct_dc_add;
        idct_add = s->dsp.h264_idct_add;
    }

    // MBAFF with deblocking: exchange the pair borders before intra
    // prediction, for the top macroblock of a pair, or for the bottom one
    // when the macroblock above it is not intra
    if(!simple && FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
       && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
        int mbt_y = mb_y&~1;
        uint8_t *top_y  = s->current_picture.data[0] + (mbt_y * 16* s->linesize  ) + mb_x * 16;
        uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
        uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
        xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
    }

    if (!simple && IS_INTRA_PCM(mb_type)) {
        unsigned int x, y;

        // The pixels are stored in h->mb array in the same order as levels,
        // copy them in output in the correct order.
        for(i=0; i<16; i++) {
            for (y=0; y<4; y++) {
                for (x=0; x<4; x++) {
                    *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
                }
            }
        }
        for(i=16; i<16+4; i++) {
            for (y=0; y<4; y++) {
                for (x=0; x<4; x++) {
                    *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
                }
            }
        }
        for(i=20; i<20+4; i++) {
            for (y=0; y<4; y++) {
                for (x=0; x<4; x++) {
                    *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
                }
            }
        }
    } else {
        if(IS_INTRA(mb_type)){
            // exchange the borders with the saved copies for the duration
            // of intra prediction; the matching call with xchg=0 follows it
            if(h->deblocking_filter && (simple || !FRAME_MBAFF))
                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);

            if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
                h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
                h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
            }

            if(IS_INTRA4x4(mb_type)){
                if(simple || !s->encoding){
                    if(IS_8x8DCT(mb_type)){
                        // four 8x8 blocks: predict, then add the residual right away
                        for(i=0; i<16; i+=4){
                            uint8_t * const ptr= dest_y + block_offset[i];
                            const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
                            const int nnz = h->non_zero_count_cache[ scan8[i] ];
                            h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
                                                   (h->topright_samples_available<<i)&0x4000, linesize);
                            if(nnz){
                                // a single nonzero coefficient in the first position takes the DC-only path
                                if(nnz == 1 && h->mb[i*16])
                                    idct_dc_add(ptr, h->mb + i*16, linesize);
                                else
                                    idct_add(ptr, h->mb + i*16, linesize);
                            }
                        }
                    }else
                    // sixteen 4x4 blocks: predict, then add the residual right away
                    for(i=0; i<16; i++){
                        uint8_t * const ptr= dest_y + block_offset[i];
                        uint8_t *topright;
                        const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
                        int nnz, tr;

                        if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
                            const int topright_avail= (h->topright_samples_available<<i)&0x8000;
                            assert(mb_y || linesize <= block_offset[i]);
                            if(!topright_avail){
                                // no top-right samples: replicate the last sample of the row above 4 times
                                tr= ptr[3 - linesize]*0x01010101;
                                topright= (uint8_t*) &tr;
                            }else
                                topright= ptr + 4 - linesize;
                        }else
                            topright= NULL;

                        h->hpc.pred4x4[ dir ](ptr, topright, linesize);
                        nnz = h->non_zero_count_cache[ scan8[i] ];
                        if(nnz){
                            if(is_h264){
                                if(nnz == 1 && h->mb[i*16])
                                    idct_dc_add(ptr, h->mb + i*16, linesize);
                                else
                                    idct_add(ptr, h->mb + i*16, linesize);
                            }else
                                svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
                        }
                    }
                }
            }else{
                // intra 16x16: predict the whole luma block and process the luma DC
                // coefficients; the per-block residual is added further down
                h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
                if(is_h264){
                    if(!transform_bypass)
                        h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
                }else
                    svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
            }
            if(h->deblocking_filter && (simple || !FRAME_MBAFF))
                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
        }else if(is_h264){
            // inter macroblock: motion compensated prediction
            hl_motion(h, dest_y, dest_cb, dest_cr,
                      s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
                      s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
                      s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
        }


        // luma residual for everything except intra 4x4, which was handled above
        if(!IS_INTRA4x4(mb_type)){
            if(is_h264){
                if(IS_INTRA16x16(mb_type)){
                    for(i=0; i<16; i++){
                        if(h->non_zero_count_cache[ scan8[i] ])
                            idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
                        else if(h->mb[i*16])
                            idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
                    }
                }else{
                    // step over 4 entries at a time when the 8x8 transform is used
                    const int di = IS_8x8DCT(mb_type) ? 4 : 1;
                    for(i=0; i<16; i+=di){
                        int nnz = h->non_zero_count_cache[ scan8[i] ];
                        if(nnz){
                            if(nnz==1 && h->mb[i*16])
                                idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
                            else
                                idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
                        }
                    }
                }
            }else{
                for(i=0; i<16; i++){
                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
                        uint8_t * const ptr= dest_y + block_offset[i];
                        svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
                    }
                }
            }
        }

        // chroma residual; block indices 16..19 go to Cb, 20..23 to Cr
        if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
            uint8_t *dest[2] = {dest_cb, dest_cr};
            if(transform_bypass){
                idct_add = idct_dc_add = s->dsp.add_pixels4;
            }else{
                idct_add = s->dsp.h264_idct_add;
                idct_dc_add = s->dsp.h264_idct_dc_add;
                chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
                chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
            }
            if(is_h264){
                for(i=16; i<16+8; i++){
                    if(h->non_zero_count_cache[ scan8[i] ])
                        idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                    else if(h->mb[i*16])
                        idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                }
            }else{
                for(i=16; i<16+8; i++){
                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
                        uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
                        svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
                    }
                }
            }
        }
    }
    if(h->deblocking_filter) {
        if (!simple && FRAME_MBAFF) {
            //FIXME try deblocking one mb at a time?
            // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
            // NOTE: mb_y and mb_xy below shadow the outer variables and refer to
            // the macroblock one row above; mb_type[] is read before the
            // !bottom early return.
            const int mb_y = s->mb_y - 1;
            uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
            const int mb_xy= mb_x + mb_y*s->mb_stride;
            const int mb_type_top   = s->current_picture.mb_type[mb_xy];
            const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
            // the pair is filtered once, after its bottom macroblock
            if (!bottom) return;
            pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
            pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
            pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;

            if(IS_INTRA(mb_type_top | mb_type_bottom))
                xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);

            backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
            // deblock a pair
            // top
            s->mb_y--; h->mb_xy -= s->mb_stride;
            tprintf(h->s.avctx, "call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
            fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
            h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
            h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
            filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
            // bottom
            s->mb_y++; h->mb_xy += s->mb_stride;
            tprintf(h->s.avctx, "call mbaff filter_mb\n");
            fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
            h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
            h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
            filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
        } else {
            // save the unfiltered borders first, then filter this macroblock
            tprintf(h->s.avctx, "call filter_mb\n");
            backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
            fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
            h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
            h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
            filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
        }
    }
}
2721
/**
 * Process a macroblock; this case avoids checks for expensive uncommon cases.
 * Calls hl_decode_mb_internal() with simple=1; hl_decode_mb() selects this
 * variant when its is_complex test is false.
 */
static void hl_decode_mb_simple(H264Context *h){
    hl_decode_mb_internal(h, 1);
}
2728
/**
 * Process a macroblock; this handles edge cases, such as interlacing.
 * Calls hl_decode_mb_internal() with simple=0, which enables all of its
 * branches. Marked av_noinline, unlike the simple variant.
 */
static void av_noinline hl_decode_mb_complex(H264Context *h){
    hl_decode_mb_internal(h, 0);
}
2735
2736 static void hl_decode_mb(H264Context *h){
2737     MpegEncContext * const s = &h->s;
2738     const int mb_xy= h->mb_xy;
2739     const int mb_type= s->current_picture.mb_type[mb_xy];
2740     int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 ||
2741                     (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || (ENABLE_H264_ENCODER && s->encoding) || ENABLE_SMALL;
2742
2743     if(ENABLE_H264_ENCODER && !s->decode)
2744         return;
2745
2746     if (is_complex)
2747         hl_decode_mb_complex(h);
2748     else hl_decode_mb_simple(h);
2749 }
2750
2751 static void pic_as_field(Picture *pic, const int parity){
2752     int i;
2753     for (i = 0; i < 4; ++i) {
2754         if (parity == PICT_BOTTOM_FIELD)
2755             pic->data[i] += pic->linesize[i];
2756         pic->reference = parity;
2757         pic->linesize[i] *= 2;
2758     }
2759 }
2760
2761 static int split_field_copy(Picture *dest, Picture *src,
2762                             int parity, int id_add){
2763     int match = !!(src->reference & parity);
2764
2765     if (match) {
2766         *dest = *src;
2767         pic_as_field(dest, parity);
2768         dest->pic_id *= 2;
2769         dest->pic_id += id_add;
2770     }
2771
2772     return match;
2773 }
2774
2775 /**
2776  * Split one reference list into field parts, interleaving by parity
2777  * as per H.264 spec section 8.2.4.2.5. Output fields have their data pointers
2778  * set to look at the actual start of data for that field.
2779  *
2780  * @param dest output list
2781  * @param dest_len maximum number of fields to put in dest
2782  * @param src the source reference list containing fields and/or field pairs
2783  *            (aka short_ref/long_ref, or
2784  *             refFrameListXShortTerm/refFrameListLongTerm in spec-speak)
2785  * @param src_len number of Picture's in source (pairs and unmatched fields)
2786  * @param parity the parity of the picture being decoded/needing
2787  *        these ref pics (PICT_{TOP,BOTTOM}_FIELD)
2788  * @return number of fields placed in dest
2789  */
2790 static int split_field_half_ref_list(Picture *dest, int dest_len,
2791                                      Picture *src,  int src_len,  int parity){
2792     int same_parity   = 1;
2793     int same_i        = 0;
2794     int opp_i         = 0;
2795     int out_i;
2796     int field_output;
2797
2798     for (out_i = 0; out_i < dest_len; out_i += field_output) {
2799         if (same_parity && same_i < src_len) {
2800             field_output = split_field_copy(dest + out_i, src + same_i,
2801                                             parity, 1);
2802             same_parity = !field_output;
2803             same_i++;
2804
2805         } else if (opp_i < src_len) {
2806             field_output = split_field_copy(dest + out_i, src + opp_i,
2807                                             PICT_FRAME - parity, 0);
2808             same_parity = field_output;
2809             opp_i++;
2810
2811         } else {
2812             break;
2813         }
2814     }
2815
2816     return out_i;
2817 }
2818
2819 /**
2820  * Split the reference frame list into a reference field list.
2821  * This implements H.264 spec 8.2.4.2.5 for a combined input list.
2822  * The input list contains both reference field pairs and
2823  * unmatched reference fields; it is ordered as spec describes
2824  * RefPicListX for frames in 8.2.4.2.1 and 8.2.4.2.3, except that
2825  * unmatched field pairs are also present. Conceptually this is equivalent
2826  * to concatenation of refFrameListXShortTerm with refFrameListLongTerm.
2827  *
2828  * @param dest output reference list where ordered fields are to be placed
2829  * @param dest_len max number of fields to place at dest
2830  * @param src source reference list, as described above
2831  * @param src_len number of pictures (pairs and unmatched fields) in src
2832  * @param parity parity of field being currently decoded
2833  *        (one of PICT_{TOP,BOTTOM}_FIELD)
2834  * @param long_i index into src array that holds first long reference picture,
2835  *        or src_len if no long refs present.
2836  */
2837 static int split_field_ref_list(Picture *dest, int dest_len,
2838                                 Picture *src,  int src_len,
2839                                 int parity,    int long_i){
2840
2841     int i = split_field_half_ref_list(dest, dest_len, src, long_i, parity);
2842     dest += i;
2843     dest_len -= i;
2844
2845     i += split_field_half_ref_list(dest, dest_len, src + long_i,
2846                                    src_len - long_i, parity);
2847     return i;
2848 }
2849
2850 /**
2851  * fills the default_ref_list.
2852  */
2853 static int fill_default_ref_list(H264Context *h){
2854     MpegEncContext * const s = &h->s;
2855     int i;
2856     int smallest_poc_greater_than_current = -1;
2857     int structure_sel;
2858     Picture sorted_short_ref[32];
2859     Picture field_entry_list[2][32];
2860     Picture *frame_list[2];
2861
2862     if (FIELD_PICTURE) {
2863         structure_sel = PICT_FRAME;
2864         frame_list[0] = field_entry_list[0];
2865         frame_list[1] = field_entry_list[1];
2866     } else {
2867         structure_sel = 0;
2868         frame_list[0] = h->default_ref_list[0];
2869         frame_list[1] = h->default_ref_list[1];
2870     }
2871
2872     if(h->slice_type_nos==FF_B_TYPE){
2873         int list;
2874         int len[2];
2875         int short_len[2];
2876         int out_i;
2877         int limit= INT_MIN;
2878
2879         /* sort frame according to poc in B slice */
2880         for(out_i=0; out_i<h->short_ref_count; out_i++){
2881             int best_i=INT_MIN;
2882             int best_poc=INT_MAX;
2883
2884             for(i=0; i<h->short_ref_count; i++){
2885                 const int poc= h->short_ref[i]->poc;
2886                 if(poc > limit && poc < best_poc){
2887                     best_poc= poc;
2888                     best_i= i;
2889                 }
2890             }
2891
2892             assert(best_i != INT_MIN);
2893
2894             limit= best_poc;
2895             sorted_short_ref[out_i]= *h->short_ref[best_i];
2896             tprintf(h->s.avctx, "sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
2897             if (-1 == smallest_poc_greater_than_current) {
2898                 if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
2899                     smallest_poc_greater_than_current = out_i;
2900                 }
2901             }
2902         }
2903
2904         tprintf(h->s.avctx, "current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
2905
2906         // find the largest poc
2907         for(list=0; list<2; list++){
2908             int index = 0;
2909             int j= -99;
2910             int step= list ? -1 : 1;
2911
2912             for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
2913                 int sel;
2914                 while(j<0 || j>= h->short_ref_count){
2915                     if(j != -99 && step == (list ? -1 : 1))
2916                         return -1;
2917                     step = -step;
2918                     j= smallest_poc_greater_than_current + (step>>1);
2919                 }
2920                 sel = sorted_short_ref[j].reference | structure_sel;
2921                 if(sel != PICT_FRAME) continue;
2922                 frame_list[list][index  ]= sorted_short_ref[j];
2923                 frame_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
2924             }
2925             short_len[list] = index;
2926
2927             for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
2928                 int sel;
2929                 if(h->long_ref[i] == NULL) continue;
2930                 sel = h->long_ref[i]->reference | structure_sel;
2931                 if(sel != PICT_FRAME) continue;
2932
2933                 frame_list[ list ][index  ]= *h->long_ref[i];
2934                 frame_list[ list ][index++].pic_id= i;
2935             }
2936             len[list] = index;
2937         }
2938
2939         for(list=0; list<2; list++){
2940             if (FIELD_PICTURE)
2941                 len[list] = split_field_ref_list(h->default_ref_list[list],
2942                                                  h->ref_count[list],
2943                                                  frame_list[list],
2944                                                  len[list],
2945                                                  s->picture_structure,
2946                                                  short_len[list]);
2947
2948             // swap the two first elements of L1 when L0 and L1 are identical
2949             if(list && len[0] > 1 && len[0] == len[1])
2950                 for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0]; i++)
2951                     if(i == len[0]){
2952                         FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2953                         break;
2954                     }
2955
2956             if(len[list] < h->ref_count[ list ])
2957                 memset(&h->default_ref_list[list][len[list]], 0, sizeof(Picture)*(h->ref_count[ list ] - len[list]));
2958         }
2959
2960
2961     }else{
2962         int index=0;
2963         int short_len;
2964         for(i=0; i<h->short_ref_count; i++){
2965             int sel;
2966             sel = h->short_ref[i]->reference | structure_sel;
2967             if(sel != PICT_FRAME) continue;
2968             frame_list[0][index  ]= *h->short_ref[i];
2969             frame_list[0][index++].pic_id= h->short_ref[i]->frame_num;
2970         }
2971         short_len = index;
2972         for(i = 0; i < 16; i++){
2973             int sel;
2974             if(h->long_ref[i] == NULL) continue;
2975             sel = h->long_ref[i]->reference | structure_sel;
2976             if(sel != PICT_FRAME) continue;
2977             frame_list[0][index  ]= *h->long_ref[i];
2978             frame_list[0][index++].pic_id= i;
2979         }
2980
2981         if (FIELD_PICTURE)
2982             index = split_field_ref_list(h->default_ref_list[0],
2983                                          h->ref_count[0], frame_list[0],
2984                                          index, s->picture_structure,
2985                                          short_len);
2986
2987         if(index < h->ref_count[0])
2988             memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
2989     }
2990 #ifdef TRACE
2991     for (i=0; i<h->ref_count[0]; i++) {
2992         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2993     }
2994     if(h->slice_type_nos==FF_B_TYPE){
2995         for (i=0; i<h->ref_count[1]; i++) {
2996             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2997         }
2998     }
2999 #endif
3000     return 0;
3001 }
3002
/* Debug helpers defined further down in this file; declared here because
 * decode_ref_pic_list_reordering() calls them before their definitions. */
static void print_short_term(H264Context *h);
static void print_long_term(H264Context *h);
3005
3006 /**
3007  * Extract structure information about the picture described by pic_num in
3008  * the current decoding context (frame or field). Note that pic_num is
3009  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
3010  * @param pic_num picture number for which to extract structure information
3011  * @param structure one of PICT_XXX describing structure of picture
3012  *                      with pic_num
3013  * @return frame number (short term) or long term index of picture
3014  *         described by pic_num
3015  */
3016 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
3017     MpegEncContext * const s = &h->s;
3018
3019     *structure = s->picture_structure;
3020     if(FIELD_PICTURE){
3021         if (!(pic_num & 1))
3022             /* opposite field */
3023             *structure ^= PICT_FRAME;
3024         pic_num >>= 1;
3025     }
3026
3027     return pic_num;
3028 }
3029
3030 static int decode_ref_pic_list_reordering(H264Context *h){
3031     MpegEncContext * const s = &h->s;
3032     int list, index, pic_structure;
3033
3034     print_short_term(h);
3035     print_long_term(h);
3036     if(h->slice_type_nos==FF_I_TYPE) return 0; //FIXME move before func
3037
3038     for(list=0; list<h->list_count; list++){
3039         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
3040
3041         if(get_bits1(&s->gb)){
3042             int pred= h->curr_pic_num;
3043
3044             for(index=0; ; index++){
3045                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
3046                 unsigned int pic_id;
3047                 int i;
3048                 Picture *ref = NULL;
3049
3050                 if(reordering_of_pic_nums_idc==3)
3051                     break;
3052
3053                 if(index >= h->ref_count[list]){
3054                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
3055                     return -1;
3056                 }
3057
3058                 if(reordering_of_pic_nums_idc<3){
3059                     if(reordering_of_pic_nums_idc<2){
3060                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
3061                         int frame_num;
3062
3063                         if(abs_diff_pic_num > h->max_pic_num){
3064                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
3065                             return -1;
3066                         }
3067
3068                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
3069                         else                                pred+= abs_diff_pic_num;
3070                         pred &= h->max_pic_num - 1;
3071
3072                         frame_num = pic_num_extract(h, pred, &pic_structure);
3073
3074                         for(i= h->short_ref_count-1; i>=0; i--){
3075                             ref = h->short_ref[i];
3076                             assert(ref->reference);
3077                             assert(!ref->long_ref);
3078                             if(ref->data[0] != NULL &&
3079                                    ref->frame_num == frame_num &&
3080                                    (ref->reference & pic_structure) &&
3081                                    ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer
3082                                 break;
3083                         }
3084                         if(i>=0)
3085                             ref->pic_id= pred;
3086                     }else{
3087                         int long_idx;
3088                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
3089
3090                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
3091
3092                         if(long_idx>31){
3093                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
3094                             return -1;
3095                         }
3096                         ref = h->long_ref[long_idx];
3097                         assert(!(ref && !ref->reference));
3098                         if(ref && (ref->reference & pic_structure)){
3099                             ref->pic_id= pic_id;
3100                             assert(ref->long_ref);
3101                             i=0;
3102                         }else{
3103                             i=-1;
3104                         }
3105                     }
3106
3107                     if (i < 0) {
3108                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
3109                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
3110                     } else {
3111                         for(i=index; i+1<h->ref_count[list]; i++){
3112                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
3113                                 break;
3114                         }
3115                         for(; i > index; i--){
3116                             h->ref_list[list][i]= h->ref_list[list][i-1];
3117                         }
3118                         h->ref_list[list][index]= *ref;
3119                         if (FIELD_PICTURE){
3120                             pic_as_field(&h->ref_list[list][index], pic_structure);
3121                         }
3122                     }
3123                 }else{
3124                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
3125                     return -1;
3126                 }
3127             }
3128         }
3129     }
3130     for(list=0; list<h->list_count; list++){
3131         for(index= 0; index < h->ref_count[list]; index++){
3132             if(!h->ref_list[list][index].data[0])
3133                 h->ref_list[list][index]= s->current_picture;
3134         }
3135     }
3136
3137     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
3138         direct_dist_scale_factor(h);
3139     direct_ref_list_init(h);
3140     return 0;
3141 }
3142
3143 static void fill_mbaff_ref_list(H264Context *h){
3144     int list, i, j;
3145     for(list=0; list<2; list++){ //FIXME try list_count
3146         for(i=0; i<h->ref_count[list]; i++){
3147             Picture *frame = &h->ref_list[list][i];
3148             Picture *field = &h->ref_list[list][16+2*i];
3149             field[0] = *frame;
3150             for(j=0; j<3; j++)
3151                 field[0].linesize[j] <<= 1;
3152             field[0].reference = PICT_TOP_FIELD;
3153             field[1] = field[0];
3154             for(j=0; j<3; j++)
3155                 field[1].data[j] += frame->linesize[j];
3156             field[1].reference = PICT_BOTTOM_FIELD;
3157
3158             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
3159             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
3160             for(j=0; j<2; j++){
3161                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
3162                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
3163             }
3164         }
3165     }
3166     for(j=0; j<h->ref_count[1]; j++){
3167         for(i=0; i<h->ref_count[0]; i++)
3168             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
3169         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
3170         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
3171     }
3172 }
3173
3174 static int pred_weight_table(H264Context *h){
3175     MpegEncContext * const s = &h->s;
3176     int list, i;
3177     int luma_def, chroma_def;
3178
3179     h->use_weight= 0;
3180     h->use_weight_chroma= 0;
3181     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3182     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3183     luma_def = 1<<h->luma_log2_weight_denom;
3184     chroma_def = 1<<h->chroma_log2_weight_denom;
3185
3186     for(list=0; list<2; list++){
3187         for(i=0; i<h->ref_count[list]; i++){
3188             int luma_weight_flag, chroma_weight_flag;
3189
3190             luma_weight_flag= get_bits1(&s->gb);
3191             if(luma_weight_flag){
3192                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3193                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3194                 if(   h->luma_weight[list][i] != luma_def
3195                    || h->luma_offset[list][i] != 0)
3196                     h->use_weight= 1;
3197             }else{
3198                 h->luma_weight[list][i]= luma_def;
3199                 h->luma_offset[list][i]= 0;
3200             }
3201
3202             chroma_weight_flag= get_bits1(&s->gb);
3203             if(chroma_weight_flag){
3204                 int j;
3205                 for(j=0; j<2; j++){
3206                     h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3207                     h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3208                     if(   h->chroma_weight[list][i][j] != chroma_def
3209                        || h->chroma_offset[list][i][j] != 0)
3210                         h->use_weight_chroma= 1;
3211                 }
3212             }else{
3213                 int j;
3214                 for(j=0; j<2; j++){
3215                     h->chroma_weight[list][i][j]= chroma_def;
3216                     h->chroma_offset[list][i][j]= 0;
3217                 }
3218             }
3219         }
3220         if(h->slice_type_nos != FF_B_TYPE) break;
3221     }
3222     h->use_weight= h->use_weight || h->use_weight_chroma;
3223     return 0;
3224 }
3225
3226 static void implicit_weight_table(H264Context *h){
3227     MpegEncContext * const s = &h->s;
3228     int ref0, ref1;
3229     int cur_poc = s->current_picture_ptr->poc;
3230
3231     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3232        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3233         h->use_weight= 0;
3234         h->use_weight_chroma= 0;
3235         return;
3236     }
3237
3238     h->use_weight= 2;
3239     h->use_weight_chroma= 2;
3240     h->luma_log2_weight_denom= 5;
3241     h->chroma_log2_weight_denom= 5;
3242
3243     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3244         int poc0 = h->ref_list[0][ref0].poc;
3245         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3246             int poc1 = h->ref_list[1][ref1].poc;
3247             int td = av_clip(poc1 - poc0, -128, 127);
3248             if(td){
3249                 int tb = av_clip(cur_poc - poc0, -128, 127);
3250                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3251                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3252                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3253                     h->implicit_weight[ref0][ref1] = 32;
3254                 else
3255                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3256             }else
3257                 h->implicit_weight[ref0][ref1] = 32;
3258         }
3259     }
3260 }
3261
3262 /**
3263  * Mark a picture as no longer needed for reference. The refmask
3264  * argument allows unreferencing of individual fields or the whole frame.
3265  * If the picture becomes entirely unreferenced, but is being held for
3266  * display purposes, it is marked as such.
3267  * @param refmask mask of fields to unreference; the mask is bitwise
3268  *                anded with the reference marking of pic
3269  * @return non-zero if pic becomes entirely unreferenced (except possibly
3270  *         for display purposes) zero if one of the fields remains in
3271  *         reference
3272  */
3273 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3274     int i;
3275     if (pic->reference &= refmask) {
3276         return 0;
3277     } else {
3278         for(i = 0; h->delayed_pic[i]; i++)
3279             if(pic == h->delayed_pic[i]){
3280                 pic->reference=DELAYED_PIC_REF;
3281                 break;
3282             }
3283         return 1;
3284     }
3285 }
3286
3287 /**
3288  * instantaneous decoder refresh.
3289  */
3290 static void idr(H264Context *h){
3291     int i;
3292
3293     for(i=0; i<16; i++){
3294         if (h->long_ref[i] != NULL) {
3295             unreference_pic(h, h->long_ref[i], 0);
3296             h->long_ref[i]= NULL;
3297         }
3298     }
3299     h->long_ref_count=0;
3300
3301     for(i=0; i<h->short_ref_count; i++){
3302         unreference_pic(h, h->short_ref[i], 0);
3303         h->short_ref[i]= NULL;
3304     }
3305     h->short_ref_count=0;
3306 }
3307
3308 /* forget old pics after a seek */
3309 static void flush_dpb(AVCodecContext *avctx){
3310     H264Context *h= avctx->priv_data;
3311     int i;
3312     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3313         if(h->delayed_pic[i])
3314             h->delayed_pic[i]->reference= 0;
3315         h->delayed_pic[i]= NULL;
3316     }
3317     h->outputed_poc= INT_MIN;
3318     idr(h);
3319     if(h->s.current_picture_ptr)
3320         h->s.current_picture_ptr->reference= 0;
3321     h->s.first_field= 0;
3322     ff_mpeg_flush(avctx);
3323 }
3324
3325 /**
3326  * Find a Picture in the short term reference list by frame number.
3327  * @param frame_num frame number to search for
3328  * @param idx the index into h->short_ref where returned picture is found
3329  *            undefined if no picture found.
3330  * @return pointer to the found picture, or NULL if no pic with the provided
3331  *                 frame number is found
3332  */
3333 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3334     MpegEncContext * const s = &h->s;
3335     int i;
3336
3337     for(i=0; i<h->short_ref_count; i++){
3338         Picture *pic= h->short_ref[i];
3339         if(s->avctx->debug&FF_DEBUG_MMCO)
3340             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3341         if(pic->frame_num == frame_num) {
3342             *idx = i;
3343             return pic;
3344         }
3345     }
3346     return NULL;
3347 }
3348
3349 /**
3350  * Remove a picture from the short term reference list by its index in
3351  * that list.  This does no checking on the provided index; it is assumed
3352  * to be valid. Other list entries are shifted down.
3353  * @param i index into h->short_ref of picture to remove.
3354  */
3355 static void remove_short_at_index(H264Context *h, int i){
3356     assert(i >= 0 && i < h->short_ref_count);
3357     h->short_ref[i]= NULL;
3358     if (--h->short_ref_count)
3359         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3360 }
3361
3362 /**
3363  *
3364  * @return the removed picture or NULL if an error occurs
3365  */
3366 static Picture * remove_short(H264Context *h, int frame_num){
3367     MpegEncContext * const s = &h->s;
3368     Picture *pic;
3369     int i;
3370
3371     if(s->avctx->debug&FF_DEBUG_MMCO)
3372         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3373
3374     pic = find_short(h, frame_num, &i);
3375     if (pic)
3376         remove_short_at_index(h, i);
3377
3378     return pic;
3379 }
3380
3381 /**
3382  * Remove a picture from the long term reference list by its index in
3383  * that list.  This does no checking on the provided index; it is assumed
3384  * to be valid. The removed entry is set to NULL. Other entries are unaffected.
3385  * @param i index into h->long_ref of picture to remove.
3386  */
3387 static void remove_long_at_index(H264Context *h, int i){
3388     h->long_ref[i]= NULL;
3389     h->long_ref_count--;
3390 }
3391
3392 /**
3393  *
3394  * @return the removed picture or NULL if an error occurs
3395  */
3396 static Picture * remove_long(H264Context *h, int i){
3397     Picture *pic;
3398
3399     pic= h->long_ref[i];
3400     if (pic)
3401         remove_long_at_index(h, i);
3402
3403     return pic;
3404 }
3405
3406 /**
3407  * print short term list
3408  */
3409 static void print_short_term(H264Context *h) {
3410     uint32_t i;
3411     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3412         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3413         for(i=0; i<h->short_ref_count; i++){
3414             Picture *pic= h->short_ref[i];
3415             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3416         }
3417     }
3418 }
3419
3420 /**
3421  * print long term list
3422  */
3423 static void print_long_term(H264Context *h) {
3424     uint32_t i;
3425     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3426         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3427         for(i = 0; i < 16; i++){
3428             Picture *pic= h->long_ref[i];
3429             if (pic) {
3430                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3431             }
3432         }
3433     }
3434 }
3435
3436 /**
3437  * Executes the reference picture marking (memory management control operations).
3438  */
3439 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3440     MpegEncContext * const s = &h->s;
3441     int i, j;
3442     int current_ref_assigned=0;
3443     Picture *pic;
3444
3445     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3446         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3447
3448     for(i=0; i<mmco_count; i++){
3449         int structure, frame_num, unref_pic;
3450         if(s->avctx->debug&FF_DEBUG_MMCO)
3451             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3452
3453         switch(mmco[i].opcode){
3454         case MMCO_SHORT2UNUSED:
3455             if(s->avctx->debug&FF_DEBUG_MMCO)
3456                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3457             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3458             pic = find_short(h, frame_num, &j);
3459             if (pic) {
3460                 if (unreference_pic(h, pic, structure ^ PICT_FRAME))
3461                     remove_short_at_index(h, j);
3462             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3463                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short failure\n");
3464             break;
3465         case MMCO_SHORT2LONG:
3466             if (FIELD_PICTURE && mmco[i].long_arg < h->long_ref_count &&
3467                     h->long_ref[mmco[i].long_arg]->frame_num ==
3468                                               mmco[i].short_pic_num / 2) {
3469                 /* do nothing, we've already moved this field pair. */
3470             } else {
3471                 int frame_num = mmco[i].short_pic_num >> FIELD_PICTURE;
3472
3473                 pic= remove_long(h, mmco[i].long_arg);
3474                 if(pic) unreference_pic(h, pic, 0);
3475
3476                 h->long_ref[ mmco[i].long_arg ]= remove_short(h, frame_num);
3477                 if (h->long_ref[ mmco[i].long_arg ]){
3478                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3479                     h->long_ref_count++;
3480                 }
3481             }
3482             break;
3483         case MMCO_LONG2UNUSED:
3484             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3485             pic = h->long_ref[j];
3486             if (pic) {
3487                 if (unreference_pic(h, pic, structure ^ PICT_FRAME))
3488                     remove_long_at_index(h, j);
3489             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3490                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3491             break;
3492         case MMCO_LONG:
3493             unref_pic = 1;
3494             if (FIELD_PICTURE && !s->first_field) {
3495                 if (h->long_ref[mmco[i].long_arg] == s->current_picture_ptr) {
3496                     /* Just mark second field as referenced */
3497                     unref_pic = 0;
3498                 } else if (s->current_picture_ptr->reference) {
3499                     /* First field in pair is in short term list or
3500                      * at a different long term index.
3501                      * This is not allowed; see 7.4.3, notes 2 and 3.
3502                      * Report the problem and keep the pair where it is,
3503                      * and mark this field valid.
3504                      */
3505                     av_log(h->s.avctx, AV_LOG_ERROR,
3506                         "illegal long term reference assignment for second "
3507                         "field in complementary field pair (first field is "
3508                         "short term or has non-matching long index)\n");
3509                     unref_pic = 0;
3510                 }
3511             }
3512
3513             if (unref_pic) {
3514                 pic= remove_long(h, mmco[i].long_arg);
3515                 if(pic) unreference_pic(h, pic, 0);
3516
3517                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3518                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3519                 h->long_ref_count++;
3520             }
3521
3522             s->current_picture_ptr->reference |= s->picture_structure;
3523             current_ref_assigned=1;
3524             break;
3525         case MMCO_SET_MAX_LONG:
3526             assert(mmco[i].long_arg <= 16);
3527             // just remove the long term which index is greater than new max
3528             for(j = mmco[i].long_arg; j<16; j++){
3529                 pic = remove_long(h, j);
3530                 if (pic) unreference_pic(h, pic, 0);
3531             }
3532             break;
3533         case MMCO_RESET:
3534             while(h->short_ref_count){
3535                 pic= remove_short(h, h->short_ref[0]->frame_num);
3536                 if(pic) unreference_pic(h, pic, 0);
3537             }
3538             for(j = 0; j < 16; j++) {
3539                 pic= remove_long(h, j);
3540                 if(pic) unreference_pic(h, pic, 0);
3541             }
3542             break;
3543         default: assert(0);
3544         }
3545     }
3546
3547     if (!current_ref_assigned && FIELD_PICTURE &&
3548             !s->first_field && s->current_picture_ptr->reference) {
3549
3550         /* Second field of complementary field pair; the first field of
3551          * which is already referenced. If short referenced, it
3552          * should be first entry in short_ref. If not, it must exist
3553          * in long_ref; trying to put it on the short list here is an
3554          * error in the encoded bit stream (ref: 7.4.3, NOTE 2 and 3).
3555          */
3556         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3557             /* Just mark the second field valid */
3558             s->current_picture_ptr->reference = PICT_FRAME;
3559         } else if (s->current_picture_ptr->long_ref) {
3560             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3561                                              "assignment for second field "
3562                                              "in complementary field pair "
3563                                              "(first field is long term)\n");
3564         } else {
3565             /*
3566              * First field in reference, but not in any sensible place on our
3567              * reference lists. This shouldn't happen unless reference
3568              * handling somewhere else is wrong.
3569              */
3570             assert(0);
3571         }
3572         current_ref_assigned = 1;
3573     }
3574
3575     if(!current_ref_assigned){
3576         pic= remove_short(h, s->current_picture_ptr->frame_num);
3577         if(pic){
3578             unreference_pic(h, pic, 0);
3579             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3580         }
3581
3582         if(h->short_ref_count)
3583             memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3584
3585         h->short_ref[0]= s->current_picture_ptr;
3586         h->short_ref[0]->long_ref=0;
3587         h->short_ref_count++;
3588         s->current_picture_ptr->reference |= s->picture_structure;
3589     }
3590
3591     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3592
3593         /* We have too many reference frames, probably due to corrupted
3594          * stream. Need to discard one frame. Prevents overrun of the
3595          * short_ref and long_ref buffers.
3596          */
3597         av_log(h->s.avctx, AV_LOG_ERROR,
3598                "number of reference frames exceeds max (probably "
3599                "corrupt input), discarding one\n");
3600
3601         if (h->long_ref_count) {
3602             for (i = 0; i < 16; ++i)
3603                 if (h->long_ref[i])
3604                     break;
3605
3606             assert(i < 16);
3607             pic = h->long_ref[i];
3608             remove_long_at_index(h, i);
3609         } else {
3610             pic = h->short_ref[h->short_ref_count - 1];
3611             remove_short_at_index(h, h->short_ref_count - 1);
3612         }
3613         unreference_pic(h, pic, 0);
3614     }
3615
3616     print_short_term(h);
3617     print_long_term(h);
3618     return 0;
3619 }
3620
3621 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3622     MpegEncContext * const s = &h->s;
3623     int i;
3624
3625     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3626         s->broken_link= get_bits1(gb) -1;
3627         h->mmco[0].long_arg= get_bits1(gb) - 1; // current_long_term_idx
3628         if(h->mmco[0].long_arg == -1)
3629             h->mmco_index= 0;
3630         else{
3631             h->mmco[0].opcode= MMCO_LONG;
3632             h->mmco_index= 1;
3633         }
3634     }else{
3635         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3636             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3637                 MMCOOpcode opcode= get_ue_golomb(gb);
3638
3639                 h->mmco[i].opcode= opcode;
3640                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3641                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3642 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3643                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3644                         return -1;
3645                     }*/
3646                 }
3647                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3648                     unsigned int long_arg= get_ue_golomb(gb);
3649                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3650                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3651                         return -1;
3652                     }
3653                     h->mmco[i].long_arg= long_arg;
3654                 }
3655
3656                 if(opcode > (unsigned)MMCO_LONG){
3657                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3658                     return -1;
3659                 }
3660                 if(opcode == MMCO_END)
3661                     break;
3662             }
3663             h->mmco_index= i;
3664         }else{
3665             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3666
3667             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3668                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3669                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3670                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3671                 h->mmco_index= 1;
3672                 if (FIELD_PICTURE) {
3673                     h->mmco[0].short_pic_num *= 2;
3674                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3675                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3676                     h->mmco_index= 2;
3677                 }
3678             }else
3679                 h->mmco_index= 0;
3680         }
3681     }
3682
3683     return 0;
3684 }
3685
3686 static int init_poc(H264Context *h){
3687     MpegEncContext * const s = &h->s;
3688     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3689     int field_poc[2];
3690
3691     if(h->nal_unit_type == NAL_IDR_SLICE){
3692         h->frame_num_offset= 0;
3693     }else{
3694         if(h->frame_num < h->prev_frame_num)
3695             h->frame_num_offset= h->prev_frame_num_offset + max_frame_num;
3696         else
3697             h->frame_num_offset= h->prev_frame_num_offset;
3698     }
3699
3700     if(h->sps.poc_type==0){
3701         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3702
3703         if(h->nal_unit_type == NAL_IDR_SLICE){
3704              h->prev_poc_msb=
3705              h->prev_poc_lsb= 0;
3706         }
3707
3708         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3709             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3710         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3711             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3712         else
3713             h->poc_msb = h->prev_poc_msb;
3714 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3715         field_poc[0] =
3716         field_poc[1] = h->poc_msb + h->poc_lsb;
3717         if(s->picture_structure == PICT_FRAME)
3718             field_poc[1] += h->delta_poc_bottom;
3719     }else if(h->sps.poc_type==1){
3720         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3721         int i;
3722
3723         if(h->sps.poc_cycle_length != 0)
3724             abs_frame_num = h->frame_num_offset + h->frame_num;
3725         else
3726             abs_frame_num = 0;
3727
3728         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3729             abs_frame_num--;
3730
3731         expected_delta_per_poc_cycle = 0;
3732         for(i=0; i < h->sps.poc_cycle_length; i++)
3733             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3734
3735         if(abs_frame_num > 0){
3736             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3737             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3738
3739             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3740             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3741                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3742         } else
3743             expectedpoc = 0;
3744
3745         if(h->nal_ref_idc == 0)
3746             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3747
3748         field_poc[0] = expectedpoc + h->delta_poc[0];
3749         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3750
3751         if(s->picture_structure == PICT_FRAME)
3752             field_poc[1] += h->delta_poc[1];
3753     }else{
3754         int poc;
3755         if(h->nal_unit_type == NAL_IDR_SLICE){
3756             poc= 0;
3757         }else{
3758             if(h->nal_ref_idc) poc= 2*(h->frame_num_offset + h->frame_num);
3759             else               poc= 2*(h->frame_num_offset + h->frame_num) - 1;
3760         }
3761         field_poc[0]= poc;
3762         field_poc[1]= poc;
3763     }
3764
3765     if(s->picture_structure != PICT_BOTTOM_FIELD) {
3766         s->current_picture_ptr->field_poc[0]= field_poc[0];
3767         s->current_picture_ptr->poc = field_poc[0];
3768     }
3769     if(s->picture_structure != PICT_TOP_FIELD) {
3770         s->current_picture_ptr->field_poc[1]= field_poc[1];
3771         s->current_picture_ptr->poc = field_poc[1];
3772     }
3773     if(!FIELD_PICTURE || !s->first_field) {
3774         Picture *cur = s->current_picture_ptr;
3775         cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3776     }
3777
3778     return 0;
3779 }
3780
3781
3782 /**
3783  * initialize scan tables
3784  */
3785 static void init_scan_tables(H264Context *h){
3786     MpegEncContext * const s = &h->s;
3787     int i;
3788     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3789         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3790         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3791     }else{
3792         for(i=0; i<16; i++){
3793 #define T(x) (x>>2) | ((x<<2) & 0xF)
3794             h->zigzag_scan[i] = T(zigzag_scan[i]);
3795             h-> field_scan[i] = T( field_scan[i]);
3796 #undef T
3797         }
3798     }
3799     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3800         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
3801         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3802         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3803         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3804     }else{
3805         for(i=0; i<64; i++){
3806 #define T(x) (x>>3) | ((x&7)<<3)
3807             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
3808             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3809             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3810             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3811 #undef T
3812         }
3813     }
3814     if(h->sps.transform_bypass){ //FIXME same ugly
3815         h->zigzag_scan_q0          = zigzag_scan;
3816         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
3817         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3818         h->field_scan_q0           = field_scan;
3819         h->field_scan8x8_q0        = field_scan8x8;
3820         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3821     }else{
3822         h->zigzag_scan_q0          = h->zigzag_scan;
3823         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3824         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3825         h->field_scan_q0           = h->field_scan;
3826         h->field_scan8x8_q0        = h->field_scan8x8;
3827         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3828     }
3829 }
3830
3831 /**
3832  * Replicates H264 "master" context to thread contexts.
3833  */
3834 static void clone_slice(H264Context *dst, H264Context *src)
3835 {
3836     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3837     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3838     dst->s.current_picture      = src->s.current_picture;
3839     dst->s.linesize             = src->s.linesize;
3840     dst->s.uvlinesize           = src->s.uvlinesize;
3841     dst->s.first_field          = src->s.first_field;
3842
3843     dst->prev_poc_msb           = src->prev_poc_msb;
3844     dst->prev_poc_lsb           = src->prev_poc_lsb;
3845     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3846     dst->prev_frame_num         = src->prev_frame_num;
3847     dst->short_ref_count        = src->short_ref_count;
3848
3849     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3850     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3851     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3852     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3853
3854     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3855     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3856 }
3857
3858 /**
3859  * decodes a slice header.
3860  * This will also call MPV_common_init() and frame_start() as needed.
3861  *
3862  * @param h h264context
3863  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3864  *
3865  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3866  */
3867 static int decode_slice_header(H264Context *h, H264Context *h0){
3868     MpegEncContext * const s = &h->s;
3869     MpegEncContext * const s0 = &h0->s;
3870     unsigned int first_mb_in_slice;
3871     unsigned int pps_id;
3872     int num_ref_idx_active_override_flag;
3873     static const uint8_t slice_type_map[5]= {FF_P_TYPE, FF_B_TYPE, FF_I_TYPE, FF_SP_TYPE, FF_SI_TYPE};
3874     unsigned int slice_type, tmp, i, j;
3875     int default_ref_list_done = 0;
3876     int last_pic_structure;
3877
3878     s->dropable= h->nal_ref_idc == 0;
3879
3880     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3881         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3882         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3883     }else{
3884         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3885         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3886     }
3887
3888     first_mb_in_slice= get_ue_golomb(&s->gb);
3889
3890     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3891         h0->current_slice = 0;
3892         if (!s0->first_field)
3893             s->current_picture_ptr= NULL;
3894     }
3895
3896     slice_type= get_ue_golomb(&s->gb);
3897     if(slice_type > 9){
3898         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3899         return -1;
3900     }
3901     if(slice_type > 4){
3902         slice_type -= 5;
3903         h->slice_type_fixed=1;
3904     }else
3905         h->slice_type_fixed=0;
3906
3907     slice_type= slice_type_map[ slice_type ];
3908     if (slice_type == FF_I_TYPE
3909         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3910         default_ref_list_done = 1;
3911     }
3912     h->slice_type= slice_type;
3913     h->slice_type_nos= slice_type & 3;
3914
3915     s->pict_type= h->slice_type; // to make a few old func happy, it's wrong though
3916     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3917         av_log(h->s.avctx, AV_LOG_ERROR,
3918                "B picture before any references, skipping\n");
3919         return -1;
3920     }
3921
3922     pps_id= get_ue_golomb(&s->gb);
3923     if(pps_id>=MAX_PPS_COUNT){
3924         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3925         return -1;
3926     }
3927     if(!h0->pps_buffers[pps_id]) {
3928         av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n");
3929         return -1;
3930     }
3931     h->pps= *h0->pps_buffers[pps_id];
3932
3933     if(!h0->sps_buffers[h->pps.sps_id]) {
3934         av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n");
3935         return -1;
3936     }
3937     h->sps = *h0->sps_buffers[h->pps.sps_id];
3938
3939     if(h == h0 && h->dequant_coeff_pps != pps_id){
3940         h->dequant_coeff_pps = pps_id;
3941         init_dequant_tables(h);
3942     }
3943
3944     s->mb_width= h->sps.mb_width;
3945     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3946
3947     h->b_stride=  s->mb_width*4;
3948     h->b8_stride= s->mb_width*2;
3949
3950     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3951     if(h->sps.frame_mbs_only_flag)
3952         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3953     else
3954         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3955
3956     if (s->context_initialized
3957         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3958         if(h != h0)
3959             return -1;   // width / height changed during parallelized decoding
3960         free_tables(h);
3961         MPV_common_end(s);
3962     }
3963     if (!s->context_initialized) {
3964         if(h != h0)
3965             return -1;  // we cant (re-)initialize context during parallel decoding
3966         if (MPV_common_init(s) < 0)
3967             return -1;
3968         s->first_field = 0;
3969
3970         init_scan_tables(h);
3971         alloc_tables(h);
3972
3973         for(i = 1; i < s->avctx->thread_count; i++) {
3974             H264Context *c;
3975             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3976             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3977             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3978             c->sps = h->sps;
3979             c->pps = h->pps;
3980             init_scan_tables(c);
3981             clone_tables(c, h);
3982         }
3983
3984         for(i = 0; i < s->avctx->thread_count; i++)
3985             if(context_init(h->thread_context[i]) < 0)
3986                 return -1;
3987
3988         s->avctx->width = s->width;
3989         s->avctx->height = s->height;
3990         s->avctx->sample_aspect_ratio= h->sps.sar;
3991         if(!s->avctx->sample_aspect_ratio.den)
3992             s->avctx->sample_aspect_ratio.den = 1;
3993
3994         if(h->sps.timing_info_present_flag){
3995             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3996             if(h->x264_build > 0 && h->x264_build < 44)
3997                 s->avctx->time_base.den *= 2;
3998             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3999                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
4000         }
4001     }
4002
4003     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
4004
4005     h->mb_mbaff = 0;
4006     h->mb_aff_frame = 0;
4007     last_pic_structure = s0->picture_structure;
4008     if(h->sps.frame_mbs_only_flag){
4009         s->picture_structure= PICT_FRAME;
4010     }else{
4011         if(get_bits1(&s->gb)) { //field_pic_flag
4012             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
4013         } else {
4014             s->picture_structure= PICT_FRAME;
4015             h->mb_aff_frame = h->sps.mb_aff;
4016         }
4017     }
4018
4019     if(h0->current_slice == 0){
4020         /* See if we have a decoded first field looking for a pair... */
4021         if (s0->first_field) {
4022             assert(s0->current_picture_ptr);
4023             assert(s0->current_picture_ptr->data[0]);
4024             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
4025
4026             /* figure out if we have a complementary field pair */
4027             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
4028                 /*
4029                  * Previous field is unmatched. Don't display it, but let it
4030                  * remain for reference if marked as such.
4031                  */
4032                 s0->current_picture_ptr = NULL;
4033                 s0->first_field = FIELD_PICTURE;
4034
4035             } else {
4036                 if (h->nal_ref_idc &&
4037                         s0->current_picture_ptr->reference &&
4038                         s0->current_picture_ptr->frame_num != h->frame_num) {
4039                     /*
4040                      * This and previous field were reference, but had
4041                      * different frame_nums. Consider this field first in
4042                      * pair. Throw away previous field except for reference
4043                      * purposes.
4044                      */
4045                     s0->first_field = 1;
4046                     s0->current_picture_ptr = NULL;
4047
4048                 } else {
4049                     /* Second field in complementary pair */
4050                     s0->first_field = 0;
4051                 }
4052             }
4053
4054         } else {
4055             /* Frame or first field in a potentially complementary pair */
4056             assert(!s0->current_picture_ptr);
4057             s0->first_field = FIELD_PICTURE;
4058         }
4059
4060         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
4061             s0->first_field = 0;
4062             return -1;
4063         }
4064     }
4065     if(h != h0)
4066         clone_slice(h, h0);
4067
4068     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
4069
4070     assert(s->mb_num == s->mb_width * s->mb_height);
4071     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
4072        first_mb_in_slice                    >= s->mb_num){
4073         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
4074         return -1;
4075     }
4076     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
4077     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
4078     if (s->picture_structure == PICT_BOTTOM_FIELD)
4079         s->resync_mb_y = s->mb_y = s->mb_y + 1;
4080     assert(s->mb_y < s->mb_height);
4081
4082     if(s->picture_structure==PICT_FRAME){
4083         h->curr_pic_num=   h->frame_num;
4084         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
4085     }else{
4086         h->curr_pic_num= 2*h->frame_num + 1;
4087         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
4088     }
4089
4090     if(h->nal_unit_type == NAL_IDR_SLICE){
4091         get_ue_golomb(&s->gb); /* idr_pic_id */
4092     }
4093
4094     if(h->sps.poc_type==0){
4095         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
4096
4097         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
4098             h->delta_poc_bottom= get_se_golomb(&s->gb);
4099         }
4100     }
4101
4102     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
4103         h->delta_poc[0]= get_se_golomb(&s->gb);
4104
4105         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
4106             h->delta_poc[1]= get_se_golomb(&s->gb);
4107     }
4108
4109     init_poc(h);
4110
4111     if(h->pps.redundant_pic_cnt_present){
4112         h->redundant_pic_count= get_ue_golomb(&s->gb);
4113     }
4114
4115     //set defaults, might be overriden a few line later
4116     h->ref_count[0]= h->pps.ref_count[0];
4117     h->ref_count[1]= h->pps.ref_count[1];
4118
4119     if(h->slice_type_nos != FF_I_TYPE){
4120         if(h->slice_type_nos == FF_B_TYPE){
4121             h->direct_spatial_mv_pred= get_bits1(&s->gb);
4122         }
4123         num_ref_idx_active_override_flag= get_bits1(&s->gb);
4124
4125         if(num_ref_idx_active_override_flag){
4126             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
4127             if(h->slice_type_nos==FF_B_TYPE)
4128                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
4129
4130             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
4131                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
4132                 h->ref_count[0]= h->ref_count[1]= 1;
4133                 return -1;
4134             }
4135         }
4136         if(h->slice_type_nos == FF_B_TYPE)
4137             h->list_count= 2;
4138         else
4139             h->list_count= 1;
4140     }else
4141         h->list_count= 0;
4142
4143     if(!default_ref_list_done){
4144         fill_default_ref_list(h);
4145     }
4146
4147     if(decode_ref_pic_list_reordering(h) < 0)
4148         return -1;
4149
4150     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
4151        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
4152         pred_weight_table(h);
4153     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
4154         implicit_weight_table(h);
4155     else
4156         h->use_weight = 0;
4157
4158     if(h->nal_ref_idc)
4159         decode_ref_pic_marking(h0, &s->gb);
4160
4161     if(FRAME_MBAFF)
4162         fill_mbaff_ref_list(h);
4163
4164     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
4165         tmp = get_ue_golomb(&s->gb);
4166         if(tmp > 2){
4167             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
4168             return -1;
4169         }
4170         h->cabac_init_idc= tmp;
4171     }
4172
4173     h->last_qscale_diff = 0;
4174     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
4175     if(tmp>51){
4176         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
4177         return -1;
4178     }
4179     s->qscale= tmp;
4180     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
4181     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
4182     //FIXME qscale / qp ... stuff
4183     if(h->slice_type == FF_SP_TYPE){
4184         get_bits1(&s->gb); /* sp_for_switch_flag */
4185     }
4186     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
4187         get_se_golomb(&s->gb); /* slice_qs_delta */
4188     }
4189
4190     h->deblocking_filter = 1;
4191     h->slice_alpha_c0_offset = 0;
4192     h->slice_beta_offset = 0;
4193     if( h->pps.deblocking_filter_parameters_present ) {
4194         tmp= get_ue_golomb(&s->gb);
4195         if(tmp > 2){
4196             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4197             return -1;
4198         }
4199         h->deblocking_filter= tmp;
4200         if(h->deblocking_filter < 2)
4201             h->deblocking_filter^= 1; // 1<->0
4202
4203         if( h->deblocking_filter ) {
4204             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4205             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4206         }
4207     }
4208
4209     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4210        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
4211        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
4212        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4213         h->deblocking_filter= 0;
4214
4215     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4216         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4217             /* Cheat slightly for speed:
4218                Do not bother to deblock across slices. */
4219             h->deblocking_filter = 2;
4220         } else {
4221             h0->max_contexts = 1;
4222             if(!h0->single_decode_warning) {
4223                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4224                 h0->single_decode_warning = 1;
4225             }
4226             if(h != h0)
4227                 return 1; // deblocking switched inside frame
4228         }
4229     }
4230
4231 #if 0 //FMO
4232     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4233         slice_group_change_cycle= get_bits(&s->gb, ?);
4234 #endif
4235
4236     h0->last_slice_type = slice_type;
4237     h->slice_num = ++h0->current_slice;
4238
4239     for(j=0; j<2; j++){
4240         int *ref2frm= h->ref2frm[h->slice_num&15][j];
4241         ref2frm[0]=
4242         ref2frm[1]= -1;
4243         for(i=0; i<48; i++)
4244             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
4245                           +(h->ref_list[j][i].reference&3);
4246     }
4247
4248     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4249     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4250
4251     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4252         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4253                h->slice_num,
4254                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4255                first_mb_in_slice,
4256                av_get_pict_type_char(h->slice_type),
4257                pps_id, h->frame_num,
4258                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4259                h->ref_count[0], h->ref_count[1],
4260                s->qscale,
4261                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4262                h->use_weight,
4263                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4264                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4265                );
4266     }
4267
4268     return 0;
4269 }
4270
4271 /**
4272  *
4273  */
4274 static inline int get_level_prefix(GetBitContext *gb){
4275     unsigned int buf;
4276     int log;
4277
4278     OPEN_READER(re, gb);
4279     UPDATE_CACHE(re, gb);
4280     buf=GET_CACHE(re, gb);
4281
4282     log= 32 - av_log2(buf);
4283 #ifdef TRACE
4284     print_bin(buf>>(32-log), log);
4285     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4286 #endif
4287
4288     LAST_SKIP_BITS(re, gb, log);
4289     CLOSE_READER(re, gb);
4290
4291     return log-1;
4292 }
4293
4294 static inline int get_dct8x8_allowed(H264Context *h){
4295     int i;
4296     for(i=0; i<4; i++){
4297         if(!IS_SUB_8X8(h->sub_mb_type[i])
4298            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4299             return 0;
4300     }
4301     return 1;
4302 }
4303
4304 /**
4305  * decodes a residual block.
4306  * @param n block index
4307  * @param scantable scantable
4308  * @param max_coeff number of coefficients in the block
4309  * @return <0 if an error occurred
4310  */
4311 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4312     MpegEncContext * const s = &h->s;
4313     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4314     int level[16];
4315     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4316
4317     //FIXME put trailing_onex into the context
4318
4319     if(n == CHROMA_DC_BLOCK_INDEX){
4320         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4321         total_coeff= coeff_token>>2;
4322     }else{
4323         if(n == LUMA_DC_BLOCK_INDEX){
4324             total_coeff= pred_non_zero_count(h, 0);
4325             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4326             total_coeff= coeff_token>>2;
4327         }else{
4328             total_coeff= pred_non_zero_count(h, n);
4329             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4330             total_coeff= coeff_token>>2;
4331             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4332         }
4333     }
4334
4335     //FIXME set last_non_zero?
4336
4337     if(total_coeff==0)
4338         return 0;
4339     if(total_coeff > (unsigned)max_coeff) {
4340         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4341         return -1;
4342     }
4343
4344     trailing_ones= coeff_token&3;
4345     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4346     assert(total_coeff<=16);
4347
4348     for(i=0; i<trailing_ones; i++){
4349         level[i]= 1 - 2*get_bits1(gb);
4350     }
4351
4352     if(i<total_coeff) {
4353         int level_code, mask;
4354         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4355         int prefix= get_level_prefix(gb);
4356
4357         //first coefficient has suffix_length equal to 0 or 1
4358         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4359             if(suffix_length)
4360                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4361             else
4362                 level_code= (prefix<<suffix_length); //part
4363         }else if(prefix==14){
4364             if(suffix_length)
4365                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4366             else
4367                 level_code= prefix + get_bits(gb, 4); //part
4368         }else{
4369             level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
4370             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4371             if(prefix>=16)
4372                 level_code += (1<<(prefix-3))-4096;
4373         }
4374
4375         if(trailing_ones < 3) level_code += 2;
4376
4377         suffix_length = 1;
4378         if(level_code > 5)
4379             suffix_length++;
4380         mask= -(level_code&1);
4381         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4382         i++;
4383
4384         //remaining coefficients have suffix_length > 0
4385         for(;i<total_coeff;i++) {
4386             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4387             prefix = get_level_prefix(gb);
4388             if(prefix<15){
4389                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4390             }else{
4391                 level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4392                 if(prefix>=16)
4393                     level_code += (1<<(prefix-3))-4096;
4394             }
4395             mask= -(level_code&1);
4396             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4397             if(level_code > suffix_limit[suffix_length])
4398                 suffix_length++;
4399         }
4400     }
4401
4402     if(total_coeff == max_coeff)
4403         zeros_left=0;
4404     else{
4405         if(n == CHROMA_DC_BLOCK_INDEX)
4406             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4407         else
4408             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4409     }
4410
4411     coeff_num = zeros_left + total_coeff - 1;
4412     j = scantable[coeff_num];
4413     if(n > 24){
4414         block[j] = level[0];
4415         for(i=1;i<total_coeff;i++) {
4416             if(zeros_left <= 0)
4417                 run_before = 0;
4418             else if(zeros_left < 7){
4419                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4420             }else{
4421                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4422             }
4423             zeros_left -= run_before;
4424             coeff_num -= 1 + run_before;
4425             j= scantable[ coeff_num ];
4426
4427             block[j]= level[i];
4428         }
4429     }else{
4430         block[j] = (level[0] * qmul[j] + 32)>>6;
4431         for(i=1;i<total_coeff;i++) {
4432             if(zeros_left <= 0)
4433                 run_before = 0;
4434             else if(zeros_left < 7){
4435                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4436             }else{
4437                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4438             }
4439             zeros_left -= run_before;
4440             coeff_num -= 1 + run_before;
4441             j= scantable[ coeff_num ];
4442
4443             block[j]= (level[i] * qmul[j] + 32)>>6;
4444         }
4445     }
4446
4447     if(zeros_left<0){
4448         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4449         return -1;
4450     }
4451
4452     return 0;
4453 }
4454
4455 static void predict_field_decoding_flag(H264Context *h){
4456     MpegEncContext * const s = &h->s;
4457     const int mb_xy= h->mb_xy;
4458     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4459                 ? s->current_picture.mb_type[mb_xy-1]
4460                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4461                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4462                 : 0;
4463     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4464 }
4465
4466 /**
4467  * decodes a P_SKIP or B_SKIP macroblock
4468  */
4469 static void decode_mb_skip(H264Context *h){
4470     MpegEncContext * const s = &h->s;
4471     const int mb_xy= h->mb_xy;
4472     int mb_type=0;
4473
4474     memset(h->non_zero_count[mb_xy], 0, 16);
4475     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4476
4477     if(MB_FIELD)
4478         mb_type|= MB_TYPE_INTERLACED;
4479
4480     if( h->slice_type_nos == FF_B_TYPE )
4481     {
4482         // just for fill_caches. pred_direct_motion will set the real mb_type
4483         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4484
4485         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4486         pred_direct_motion(h, &mb_type);
4487         mb_type|= MB_TYPE_SKIP;
4488     }
4489     else
4490     {
4491         int mx, my;
4492         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4493
4494         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4495         pred_pskip_motion(h, &mx, &my);
4496         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4497         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4498     }
4499
4500     write_back_motion(h, mb_type);
4501     s->current_picture.mb_type[mb_xy]= mb_type;
4502     s->current_picture.qscale_table[mb_xy]= s->qscale;
4503     h->slice_table[ mb_xy ]= h->slice_num;
4504     h->prev_mb_skipped= 1;
4505 }
4506
4507 /**
4508  * decodes a macroblock
4509  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4510  */
4511 static int decode_mb_cavlc(H264Context *h){
4512     MpegEncContext * const s = &h->s;
4513     int mb_xy;
4514     int partition_count;
4515     unsigned int mb_type, cbp;
4516     int dct8x8_allowed= h->pps.transform_8x8_mode;
4517
4518     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4519
4520     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
4521
4522     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4523     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4524                 down the code */
4525     if(h->slice_type_nos != FF_I_TYPE){
4526         if(s->mb_skip_run==-1)
4527             s->mb_skip_run= get_ue_golomb(&s->gb);
4528
4529         if (s->mb_skip_run--) {
4530             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4531                 if(s->mb_skip_run==0)
4532                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4533                 else
4534                     predict_field_decoding_flag(h);
4535             }
4536             decode_mb_skip(h);
4537             return 0;
4538         }
4539     }
4540     if(FRAME_MBAFF){
4541         if( (s->mb_y&1) == 0 )
4542             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4543     }else
4544         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
4545
4546     h->prev_mb_skipped= 0;
4547
4548     mb_type= get_ue_golomb(&s->gb);
4549     if(h->slice_type_nos == FF_B_TYPE){
4550         if(mb_type < 23){
4551             partition_count= b_mb_type_info[mb_type].partition_count;
4552             mb_type=         b_mb_type_info[mb_type].type;
4553         }else{
4554             mb_type -= 23;
4555             goto decode_intra_mb;
4556         }
4557     }else if(h->slice_type_nos == FF_P_TYPE){
4558         if(mb_type < 5){
4559             partition_count= p_mb_type_info[mb_type].partition_count;
4560             mb_type=         p_mb_type_info[mb_type].type;
4561         }else{
4562             mb_type -= 5;
4563             goto decode_intra_mb;
4564         }
4565     }else{
4566        assert(h->slice_type_nos == FF_I_TYPE);
4567         if(h->slice_type == FF_SI_TYPE && mb_type)
4568             mb_type--;
4569 decode_intra_mb:
4570         if(mb_type > 25){
4571             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4572             return -1;
4573         }
4574         partition_count=0;
4575         cbp= i_mb_type_info[mb_type].cbp;
4576         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4577         mb_type= i_mb_type_info[mb_type].type;
4578     }
4579
4580     if(MB_FIELD)
4581         mb_type |= MB_TYPE_INTERLACED;
4582
4583     h->slice_table[ mb_xy ]= h->slice_num;
4584
4585     if(IS_INTRA_PCM(mb_type)){
4586         unsigned int x, y;
4587
4588         // We assume these blocks are very rare so we do not optimize it.
4589         align_get_bits(&s->gb);
4590
4591         // The pixels are stored in the same order as levels in h->mb array.
4592         for(y=0; y<16; y++){
4593             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
4594             for(x=0; x<16; x++){
4595                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4596                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
4597             }
4598         }
4599         for(y=0; y<8; y++){
4600             const int index= 256 + 4*(y&3) + 32*(y>>2);
4601             for(x=0; x<8; x++){
4602                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4603                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4604             }
4605         }
4606         for(y=0; y<8; y++){
4607             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
4608             for(x=0; x<8; x++){
4609                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4610                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4611             }
4612         }
4613
4614         // In deblocking, the quantizer is 0
4615         s->current_picture.qscale_table[mb_xy]= 0;
4616         // All coeffs are present
4617         memset(h->non_zero_count[mb_xy], 16, 16);
4618
4619         s->current_picture.mb_type[mb_xy]= mb_type;
4620         return 0;
4621     }
4622
4623     if(MB_MBAFF){
4624         h->ref_count[0] <<= 1;
4625         h->ref_count[1] <<= 1;
4626     }
4627
4628     fill_caches(h, mb_type, 0);
4629
4630     //mb_pred
4631     if(IS_INTRA(mb_type)){
4632             int pred_mode;
4633 //            init_top_left_availability(h);
4634             if(IS_INTRA4x4(mb_type)){
4635                 int i;
4636                 int di = 1;
4637                 if(dct8x8_allowed && get_bits1(&s->gb)){
4638                     mb_type |= MB_TYPE_8x8DCT;
4639                     di = 4;
4640                 }
4641
4642 //                fill_intra4x4_pred_table(h);
4643                 for(i=0; i<16; i+=di){
4644                     int mode= pred_intra_mode(h, i);
4645
4646                     if(!get_bits1(&s->gb)){
4647                         const int rem_mode= get_bits(&s->gb, 3);
4648                         mode = rem_mode + (rem_mode >= mode);
4649                     }
4650
4651                     if(di==4)
4652                         fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4653                     else
4654                         h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4655                 }
4656                 write_back_intra_pred_mode(h);
4657                 if( check_intra4x4_pred_mode(h) < 0)
4658                     return -1;
4659             }else{
4660                 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4661                 if(h->intra16x16_pred_mode < 0)
4662                     return -1;
4663             }
4664
4665             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4666             if(pred_mode < 0)
4667                 return -1;
4668             h->chroma_pred_mode= pred_mode;
4669     }else if(partition_count==4){
4670         int i, j, sub_partition_count[4], list, ref[2][4];
4671
4672         if(h->slice_type_nos == FF_B_TYPE){
4673             for(i=0; i<4; i++){
4674                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4675                 if(h->sub_mb_type[i] >=13){
4676                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4677                     return -1;
4678                 }
4679                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4680                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4681             }
4682             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4683                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4684                 pred_direct_motion(h, &mb_type);
4685                 h->ref_cache[0][scan8[4]] =
4686                 h->ref_cache[1][scan8[4]] =
4687                 h->ref_cache[0][scan8[12]] =
4688                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4689             }
4690         }else{
4691             assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4692             for(i=0; i<4; i++){
4693                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4694                 if(h->sub_mb_type[i] >=4){
4695                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4696                     return -1;
4697                 }
4698                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4699                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4700             }
4701         }
4702
4703         for(list=0; list<h->list_count; list++){
4704             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4705             for(i=0; i<4; i++){
4706                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4707                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4708                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4709                     if(tmp>=ref_count){
4710                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4711                         return -1;
4712                     }
4713                     ref[list][i]= tmp;
4714                 }else{
4715                  //FIXME
4716                     ref[list][i] = -1;
4717                 }
4718             }
4719         }
4720
4721         if(dct8x8_allowed)
4722             dct8x8_allowed = get_dct8x8_allowed(h);
4723
4724         for(list=0; list<h->list_count; list++){
4725             for(i=0; i<4; i++){
4726                 if(IS_DIRECT(h->sub_mb_type[i])) {
4727                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4728                     continue;
4729                 }
4730                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4731                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4732
4733                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4734                     const int sub_mb_type= h->sub_mb_type[i];
4735                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4736                     for(j=0; j<sub_partition_count[i]; j++){
4737                         int mx, my;
4738                         const int index= 4*i + block_width*j;
4739                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4740                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4741                         mx += get_se_golomb(&s->gb);
4742                         my += get_se_golomb(&s->gb);
4743                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4744
4745                         if(IS_SUB_8X8(sub_mb_type)){
4746                             mv_cache[ 1 ][0]=
4747                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4748                             mv_cache[ 1 ][1]=
4749                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4750                         }else if(IS_SUB_8X4(sub_mb_type)){
4751                             mv_cache[ 1 ][0]= mx;
4752                             mv_cache[ 1 ][1]= my;
4753                         }else if(IS_SUB_4X8(sub_mb_type)){
4754                             mv_cache[ 8 ][0]= mx;
4755                             mv_cache[ 8 ][1]= my;
4756                         }
4757                         mv_cache[ 0 ][0]= mx;
4758                         mv_cache[ 0 ][1]= my;
4759                     }
4760                 }else{
4761                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4762                     p[0] = p[1]=
4763                     p[8] = p[9]= 0;
4764                 }
4765             }
4766         }
4767     }else if(IS_DIRECT(mb_type)){
4768         pred_direct_motion(h, &mb_type);
4769         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4770     }else{
4771         int list, mx, my, i;
4772          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4773         if(IS_16X16(mb_type)){
4774             for(list=0; list<h->list_count; list++){
4775                     unsigned int val;
4776                     if(IS_DIR(mb_type, 0, list)){
4777                         val= get_te0_golomb(&s->gb, h->ref_count[list]);
4778                         if(val >= h->ref_count[list]){
4779                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4780                             return -1;
4781                         }
4782                     }else
4783                         val= LIST_NOT_USED&0xFF;
4784                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4785             }
4786             for(list=0; list<h->list_count; list++){
4787                 unsigned int val;
4788                 if(IS_DIR(mb_type, 0, list)){
4789                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4790                     mx += get_se_golomb(&s->gb);
4791                     my += get_se_golomb(&s->gb);
4792                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4793
4794                     val= pack16to32(mx,my);
4795                 }else
4796                     val=0;
4797                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4798             }
4799         }
4800         else if(IS_16X8(mb_type)){
4801             for(list=0; list<h->list_count; list++){
4802                     for(i=0; i<2; i++){
4803                         unsigned int val;
4804                         if(IS_DIR(mb_type, i, list)){
4805                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4806                             if(val >= h->ref_count[list]){
4807                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4808                                 return -1;
4809                             }
4810                         }else
4811                             val= LIST_NOT_USED&0xFF;
4812                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4813                     }
4814             }
4815             for(list=0; list<h->list_count; list++){
4816                 for(i=0; i<2; i++){
4817                     unsigned int val;
4818                     if(IS_DIR(mb_type, i, list)){
4819                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4820                         mx += get_se_golomb(&s->gb);
4821                         my += get_se_golomb(&s->gb);
4822                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4823
4824                         val= pack16to32(mx,my);
4825                     }else
4826                         val=0;
4827                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4828                 }
4829             }
4830         }else{
4831             assert(IS_8X16(mb_type));
4832             for(list=0; list<h->list_count; list++){
4833                     for(i=0; i<2; i++){
4834                         unsigned int val;
4835                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4836                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4837                             if(val >= h->ref_count[list]){
4838                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4839                                 return -1;
4840                             }
4841                         }else
4842                             val= LIST_NOT_USED&0xFF;
4843                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4844                     }
4845             }
4846             for(list=0; list<h->list_count; list++){
4847                 for(i=0; i<2; i++){
4848                     unsigned int val;
4849                     if(IS_DIR(mb_type, i, list)){
4850                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4851                         mx += get_se_golomb(&s->gb);
4852                         my += get_se_golomb(&s->gb);
4853                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4854
4855                         val= pack16to32(mx,my);
4856                     }else
4857                         val=0;
4858                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4859                 }
4860             }
4861         }
4862     }
4863
4864     if(IS_INTER(mb_type))
4865         write_back_motion(h, mb_type);
4866
4867     if(!IS_INTRA16x16(mb_type)){
4868         cbp= get_ue_golomb(&s->gb);
4869         if(cbp > 47){
4870             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4871             return -1;
4872         }
4873
4874         if(IS_INTRA4x4(mb_type))
4875             cbp= golomb_to_intra4x4_cbp[cbp];
4876         else
4877             cbp= golomb_to_inter_cbp[cbp];
4878     }
4879     h->cbp = cbp;
4880
4881     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4882         if(get_bits1(&s->gb))
4883             mb_type |= MB_TYPE_8x8DCT;
4884     }
4885     s->current_picture.mb_type[mb_xy]= mb_type;
4886
4887     if(cbp || IS_INTRA16x16(mb_type)){
4888         int i8x8, i4x4, chroma_idx;
4889         int dquant;
4890         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4891         const uint8_t *scan, *scan8x8, *dc_scan;
4892
4893 //        fill_non_zero_count_cache(h);
4894
4895         if(IS_INTERLACED(mb_type)){
4896             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4897             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4898             dc_scan= luma_dc_field_scan;
4899         }else{
4900             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4901             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4902             dc_scan= luma_dc_zigzag_scan;
4903         }
4904
4905         dquant= get_se_golomb(&s->gb);
4906
4907         if( dquant > 25 || dquant < -26 ){
4908             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4909             return -1;
4910         }
4911
4912         s->qscale += dquant;
4913         if(((unsigned)s->qscale) > 51){
4914             if(s->qscale<0) s->qscale+= 52;
4915             else            s->qscale-= 52;
4916         }
4917
4918         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4919         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4920         if(IS_INTRA16x16(mb_type)){
4921             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4922                 return -1; //FIXME continue if partitioned and other return -1 too
4923             }
4924
4925             assert((cbp&15) == 0 || (cbp&15) == 15);
4926
4927             if(cbp&15){
4928                 for(i8x8=0; i8x8<4; i8x8++){
4929                     for(i4x4=0; i4x4<4; i4x4++){
4930                         const int index= i4x4 + 4*i8x8;
4931                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4932                             return -1;
4933                         }
4934                     }
4935                 }
4936             }else{
4937                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4938             }
4939         }else{
4940             for(i8x8=0; i8x8<4; i8x8++){
4941                 if(cbp & (1<<i8x8)){
4942                     if(IS_8x8DCT(mb_type)){
4943                         DCTELEM *buf = &h->mb[64*i8x8];
4944                         uint8_t *nnz;
4945                         for(i4x4=0; i4x4<4; i4x4++){
4946                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4947                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4948                                 return -1;
4949                         }
4950                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4951                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4952                     }else{
4953                         for(i4x4=0; i4x4<4; i4x4++){
4954                             const int index= i4x4 + 4*i8x8;
4955
4956                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4957                                 return -1;
4958                             }
4959                         }
4960                     }
4961                 }else{
4962                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4963                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4964                 }
4965             }
4966         }
4967
4968         if(cbp&0x30){
4969             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4970                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4971                     return -1;
4972                 }
4973         }
4974
4975         if(cbp&0x20){
4976             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4977                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4978                 for(i4x4=0; i4x4<4; i4x4++){
4979                     const int index= 16 + 4*chroma_idx + i4x4;
4980                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4981                         return -1;
4982                     }
4983                 }
4984             }
4985         }else{
4986             uint8_t * const nnz= &h->non_zero_count_cache[0];
4987             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4988             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4989         }
4990     }else{
4991         uint8_t * const nnz= &h->non_zero_count_cache[0];
4992         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4993         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4994         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4995     }
4996     s->current_picture.qscale_table[mb_xy]= s->qscale;
4997     write_back_non_zero_count(h);
4998
4999     if(MB_MBAFF){
5000         h->ref_count[0] >>= 1;
5001         h->ref_count[1] >>= 1;
5002     }
5003
5004     return 0;
5005 }
5006
5007 static int decode_cabac_field_decoding_flag(H264Context *h) {
5008     MpegEncContext * const s = &h->s;
5009     const int mb_x = s->mb_x;
5010     const int mb_y = s->mb_y & ~1;
5011     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
5012     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
5013
5014     unsigned int ctx = 0;
5015
5016     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
5017         ctx += 1;
5018     }
5019     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
5020         ctx += 1;
5021     }
5022
5023     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
5024 }
5025
5026 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
5027     uint8_t *state= &h->cabac_state[ctx_base];
5028     int mb_type;
5029
5030     if(intra_slice){
5031         MpegEncContext * const s = &h->s;
5032         const int mba_xy = h->left_mb_xy[0];
5033         const int mbb_xy = h->top_mb_xy;
5034         int ctx=0;
5035         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
5036             ctx++;
5037         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
5038             ctx++;
5039         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
5040             return 0;   /* I4x4 */
5041         state += 2;
5042     }else{
5043         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
5044             return 0;   /* I4x4 */
5045     }
5046
5047     if( get_cabac_terminate( &h->cabac ) )
5048         return 25;  /* PCM */
5049
5050     mb_type = 1; /* I16x16 */
5051     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
5052     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
5053         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
5054     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
5055     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
5056     return mb_type;
5057 }
5058
5059 static int decode_cabac_mb_type( H264Context *h ) {
5060     MpegEncContext * const s = &h->s;
5061
5062     if( h->slice_type_nos == FF_I_TYPE ) {
5063         return decode_cabac_intra_mb_type(h, 3, 1);
5064     } else if( h->slice_type_nos == FF_P_TYPE ) {
5065         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5066             /* P-type */
5067             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5068                 /* P_L0_D16x16, P_8x8 */
5069                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5070             } else {
5071                 /* P_L0_D8x16, P_L0_D16x8 */
5072                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5073             }
5074         } else {
5075             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
5076         }
5077     } else if( h->slice_type_nos == FF_B_TYPE ) {
5078         const int mba_xy = h->left_mb_xy[0];
5079         const int mbb_xy = h->top_mb_xy;
5080         int ctx = 0;
5081         int bits;
5082
5083         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
5084             ctx++;
5085         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
5086             ctx++;
5087
5088         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
5089             return 0; /* B_Direct_16x16 */
5090
5091         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
5092             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
5093         }
5094
5095         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
5096         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
5097         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
5098         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5099         if( bits < 8 )
5100             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
5101         else if( bits == 13 ) {
5102             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
5103         } else if( bits == 14 )
5104             return 11; /* B_L1_L0_8x16 */
5105         else if( bits == 15 )
5106             return 22; /* B_8x8 */
5107
5108         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5109         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5110     } else {
5111         /* TODO SI/SP frames? */
5112         return -1;
5113     }
5114 }
5115
5116 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
5117     MpegEncContext * const s = &h->s;
5118     int mba_xy, mbb_xy;
5119     int ctx = 0;
5120
5121     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
5122         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
5123         mba_xy = mb_xy - 1;
5124         if( (mb_y&1)
5125             && h->slice_table[mba_xy] == h->slice_num
5126             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
5127             mba_xy += s->mb_stride;
5128         if( MB_FIELD ){
5129             mbb_xy = mb_xy - s->mb_stride;
5130             if( !(mb_y&1)
5131                 && h->slice_table[mbb_xy] == h->slice_num
5132                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
5133                 mbb_xy -= s->mb_stride;
5134         }else
5135             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
5136     }else{
5137         int mb_xy = h->mb_xy;
5138         mba_xy = mb_xy - 1;
5139         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
5140     }
5141
5142     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5143         ctx++;
5144     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5145         ctx++;
5146
5147     if( h->slice_type_nos == FF_B_TYPE )
5148         ctx += 13;
5149     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
5150 }
5151
5152 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5153     int mode = 0;
5154
5155     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5156         return pred_mode;
5157
5158     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5159     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5160     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5161
5162     if( mode >= pred_mode )
5163         return mode + 1;
5164     else
5165         return mode;
5166 }
5167
5168 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5169     const int mba_xy = h->left_mb_xy[0];
5170     const int mbb_xy = h->top_mb_xy;
5171
5172     int ctx = 0;
5173
5174     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5175     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5176         ctx++;
5177
5178     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5179         ctx++;
5180
5181     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5182         return 0;
5183
5184     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5185         return 1;
5186     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5187         return 2;
5188     else
5189         return 3;
5190 }
5191
5192 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5193     int cbp_b, cbp_a, ctx, cbp = 0;
5194
5195     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5196     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
5197
5198     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5199     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5200     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
5201     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5202     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
5203     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5204     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
5205     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5206     return cbp;
5207 }
5208 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5209     int ctx;
5210     int cbp_a, cbp_b;
5211
5212     cbp_a = (h->left_cbp>>4)&0x03;
5213     cbp_b = (h-> top_cbp>>4)&0x03;
5214
5215     ctx = 0;
5216     if( cbp_a > 0 ) ctx++;
5217     if( cbp_b > 0 ) ctx += 2;
5218     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5219         return 0;
5220
5221     ctx = 4;
5222     if( cbp_a == 2 ) ctx++;
5223     if( cbp_b == 2 ) ctx += 2;
5224     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5225 }
5226 static int decode_cabac_mb_dqp( H264Context *h) {
5227     int   ctx = 0;
5228     int   val = 0;
5229
5230     if( h->last_qscale_diff != 0 )
5231         ctx++;
5232
5233     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5234         if( ctx < 2 )
5235             ctx = 2;
5236         else
5237             ctx = 3;
5238         val++;
5239         if(val > 102) //prevent infinite loop
5240             return INT_MIN;
5241     }
5242
5243     if( val&0x01 )
5244         return (val + 1)/2;
5245     else
5246         return -(val + 1)/2;
5247 }
5248 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5249     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5250         return 0;   /* 8x8 */
5251     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5252         return 1;   /* 8x4 */
5253     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5254         return 2;   /* 4x8 */
5255     return 3;       /* 4x4 */
5256 }
5257 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5258     int type;
5259     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5260         return 0;   /* B_Direct_8x8 */
5261     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5262         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5263     type = 3;
5264     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5265         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5266             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5267         type += 4;
5268     }
5269     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5270     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5271     return type;
5272 }
5273
5274 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5275     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5276 }
5277
5278 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5279     int refa = h->ref_cache[list][scan8[n] - 1];
5280     int refb = h->ref_cache[list][scan8[n] - 8];
5281     int ref  = 0;
5282     int ctx  = 0;
5283
5284     if( h->slice_type_nos == FF_B_TYPE) {
5285         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5286             ctx++;
5287         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5288             ctx += 2;
5289     } else {
5290         if( refa > 0 )
5291             ctx++;
5292         if( refb > 0 )
5293             ctx += 2;
5294     }
5295
5296     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5297         ref++;
5298         if( ctx < 4 )
5299             ctx = 4;
5300         else
5301             ctx = 5;
5302         if(ref >= 32 /*h->ref_list[list]*/){
5303             av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
5304             return 0; //FIXME we should return -1 and check the return everywhere
5305         }
5306     }
5307     return ref;
5308 }
5309
5310 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5311     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5312                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5313     int ctxbase = (l == 0) ? 40 : 47;
5314     int ctx, mvd;
5315
5316     if( amvd < 3 )
5317         ctx = 0;
5318     else if( amvd > 32 )
5319         ctx = 2;
5320     else
5321         ctx = 1;
5322
5323     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5324         return 0;
5325
5326     mvd= 1;
5327     ctx= 3;
5328     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5329         mvd++;
5330         if( ctx < 6 )
5331             ctx++;
5332     }
5333
5334     if( mvd >= 9 ) {
5335         int k = 3;
5336         while( get_cabac_bypass( &h->cabac ) ) {
5337             mvd += 1 << k;
5338             k++;
5339             if(k>24){
5340                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5341                 return INT_MIN;
5342             }
5343         }
5344         while( k-- ) {
5345             if( get_cabac_bypass( &h->cabac ) )
5346                 mvd += 1 << k;
5347         }
5348     }
5349     return get_cabac_bypass_sign( &h->cabac, -mvd );
5350 }
5351
5352 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5353     int nza, nzb;
5354     int ctx = 0;
5355
5356     if( is_dc ) {
5357         if( cat == 0 ) {
5358             nza = h->left_cbp&0x100;
5359             nzb = h-> top_cbp&0x100;
5360         } else {
5361             nza = (h->left_cbp>>(6+idx))&0x01;
5362             nzb = (h-> top_cbp>>(6+idx))&0x01;
5363         }
5364     } else {
5365         if( cat == 4 ) {
5366             nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5367             nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5368         } else {
5369             assert(cat == 1 || cat == 2);
5370             nza = h->non_zero_count_cache[scan8[idx] - 1];
5371             nzb = h->non_zero_count_cache[scan8[idx] - 8];
5372         }
5373     }
5374
5375     if( nza > 0 )
5376         ctx++;
5377
5378     if( nzb > 0 )
5379         ctx += 2;
5380
5381     return ctx + 4 * cat;
5382 }
5383
5384 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5385     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5386     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5387     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5388     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5389 };
5390
5391 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5392     static const int significant_coeff_flag_offset[2][6] = {
5393       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5394       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5395     };
5396     static const int last_coeff_flag_offset[2][6] = {
5397       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5398       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5399     };
5400     static const int coeff_abs_level_m1_offset[6] = {
5401         227+0, 227+10, 227+20, 227+30, 227+39, 426
5402     };
5403     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5404       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5405         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5406         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5407        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5408       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5409         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5410         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5411         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5412     };
5413     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5414      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5415      * map node ctx => cabac ctx for level=1 */
5416     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5417     /* map node ctx => cabac ctx for level>1 */
5418     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5419     static const uint8_t coeff_abs_level_transition[2][8] = {
5420     /* update node ctx after decoding a level=1 */
5421         { 1, 2, 3, 3, 4, 5, 6, 7 },
5422     /* update node ctx after decoding a level>1 */
5423         { 4, 4, 4, 4, 5, 6, 7, 7 }
5424     };
5425
5426     int index[64];
5427
5428     int av_unused last;
5429     int coeff_count = 0;
5430     int node_ctx = 0;
5431
5432     uint8_t *significant_coeff_ctx_base;
5433     uint8_t *last_coeff_ctx_base;
5434     uint8_t *abs_level_m1_ctx_base;
5435
5436 #ifndef ARCH_X86
5437 #define CABAC_ON_STACK
5438 #endif
5439 #ifdef CABAC_ON_STACK
5440 #define CC &cc
5441     CABACContext cc;
5442     cc.range     = h->cabac.range;
5443     cc.low       = h->cabac.low;
5444     cc.bytestream= h->cabac.bytestream;
5445 #else
5446 #define CC &h->cabac
5447 #endif
5448
5449
5450     /* cat: 0-> DC 16x16  n = 0
5451      *      1-> AC 16x16  n = luma4x4idx
5452      *      2-> Luma4x4   n = luma4x4idx
5453      *      3-> DC Chroma n = iCbCr
5454      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5455      *      5-> Luma8x8   n = 4 * luma8x8idx
5456      */
5457
5458     /* read coded block flag */
5459     if( is_dc || cat != 5 ) {
5460         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5461             if( !is_dc ) {
5462                 if( cat == 4 )
5463                     h->non_zero_count_cache[scan8[16+n]] = 0;
5464                 else
5465                     h->non_zero_count_cache[scan8[n]] = 0;
5466             }
5467
5468 #ifdef CABAC_ON_STACK
5469             h->cabac.range     = cc.range     ;
5470             h->cabac.low       = cc.low       ;
5471             h->cabac.bytestream= cc.bytestream;
5472 #endif
5473             return;
5474         }
5475     }
5476
5477     significant_coeff_ctx_base = h->cabac_state
5478         + significant_coeff_flag_offset[MB_FIELD][cat];
5479     last_coeff_ctx_base = h->cabac_state
5480         + last_coeff_flag_offset[MB_FIELD][cat];
5481     abs_level_m1_ctx_base = h->cabac_state
5482         + coeff_abs_level_m1_offset[cat];
5483
5484     if( !is_dc && cat == 5 ) {
5485 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5486         for(last= 0; last < coefs; last++) { \
5487             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5488             if( get_cabac( CC, sig_ctx )) { \
5489                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5490                 index[coeff_count++] = last; \
5491                 if( get_cabac( CC, last_ctx ) ) { \
5492                     last= max_coeff; \
5493                     break; \
5494                 } \
5495             } \
5496         }\
5497         if( last == max_coeff -1 ) {\
5498             index[coeff_count++] = last;\
5499         }
5500         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5501 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5502         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5503     } else {
5504         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5505 #else
5506         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5507     } else {
5508         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5509 #endif
5510     }
5511     assert(coeff_count > 0);
5512
5513     if( is_dc ) {
5514         if( cat == 0 )
5515             h->cbp_table[h->mb_xy] |= 0x100;
5516         else
5517             h->cbp_table[h->mb_xy] |= 0x40 << n;
5518     } else {
5519         if( cat == 5 )
5520             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5521         else if( cat == 4 )
5522             h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5523         else {
5524             assert( cat == 1 || cat == 2 );
5525             h->non_zero_count_cache[scan8[n]] = coeff_count;
5526         }
5527     }
5528
5529     while( coeff_count-- ) {
5530         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5531
5532         int j= scantable[index[coeff_count]];
5533
5534         if( get_cabac( CC, ctx ) == 0 ) {
5535             node_ctx = coeff_abs_level_transition[0][node_ctx];
5536             if( is_dc ) {
5537                 block[j] = get_cabac_bypass_sign( CC, -1);
5538             }else{
5539                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5540             }
5541         } else {
5542             int coeff_abs = 2;
5543             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5544             node_ctx = coeff_abs_level_transition[1][node_ctx];
5545
5546             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5547                 coeff_abs++;
5548             }
5549
5550             if( coeff_abs >= 15 ) {
5551                 int j = 0;
5552                 while( get_cabac_bypass( CC ) ) {
5553                     j++;
5554                 }
5555
5556                 coeff_abs=1;
5557                 while( j-- ) {
5558                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5559                 }
5560                 coeff_abs+= 14;
5561             }
5562
5563             if( is_dc ) {
5564                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5565             }else{
5566                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5567             }
5568         }
5569     }
5570 #ifdef CABAC_ON_STACK
5571             h->cabac.range     = cc.range     ;
5572             h->cabac.low       = cc.low       ;
5573             h->cabac.bytestream= cc.bytestream;
5574 #endif
5575
5576 }
5577
5578 #ifndef CONFIG_SMALL
5579 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5580     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5581 }
5582
5583 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5584     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5585 }
5586 #endif
5587
5588 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5589 #ifdef CONFIG_SMALL
5590     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5591 #else
5592     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5593     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5594 #endif
5595 }
5596
5597 static inline void compute_mb_neighbors(H264Context *h)
5598 {
5599     MpegEncContext * const s = &h->s;
5600     const int mb_xy  = h->mb_xy;
5601     h->top_mb_xy     = mb_xy - s->mb_stride;
5602     h->left_mb_xy[0] = mb_xy - 1;
5603     if(FRAME_MBAFF){
5604         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5605         const int top_pair_xy      = pair_xy     - s->mb_stride;
5606         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5607         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5608         const int curr_mb_frame_flag = !MB_FIELD;
5609         const int bottom = (s->mb_y & 1);
5610         if (bottom
5611                 ? !curr_mb_frame_flag // bottom macroblock
5612                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5613                 ) {
5614             h->top_mb_xy -= s->mb_stride;
5615         }
5616         if (left_mb_frame_flag != curr_mb_frame_flag) {
5617             h->left_mb_xy[0] = pair_xy - 1;
5618         }
5619     } else if (FIELD_PICTURE) {
5620         h->top_mb_xy -= s->mb_stride;
5621     }
5622     return;
5623 }
5624
5625 /**
5626  * decodes a macroblock
5627  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5628  */
5629 static int decode_mb_cabac(H264Context *h) {
5630     MpegEncContext * const s = &h->s;
5631     int mb_xy;
5632     int mb_type, partition_count, cbp = 0;
5633     int dct8x8_allowed= h->pps.transform_8x8_mode;
5634
5635     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5636
5637     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5638
5639     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5640     if( h->slice_type_nos != FF_I_TYPE ) {
5641         int skip;
5642         /* a skipped mb needs the aff flag from the following mb */
5643         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5644             predict_field_decoding_flag(h);
5645         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5646             skip = h->next_mb_skipped;
5647         else
5648             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5649         /* read skip flags */
5650         if( skip ) {
5651             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5652                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5653                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5654                 if(h->next_mb_skipped)
5655                     predict_field_decoding_flag(h);
5656                 else
5657                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5658             }
5659
5660             decode_mb_skip(h);
5661
5662             h->cbp_table[mb_xy] = 0;
5663             h->chroma_pred_mode_table[mb_xy] = 0;
5664             h->last_qscale_diff = 0;
5665
5666             return 0;
5667
5668         }
5669     }
5670     if(FRAME_MBAFF){
5671         if( (s->mb_y&1) == 0 )
5672             h->mb_mbaff =
5673             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5674     }else
5675         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5676
5677     h->prev_mb_skipped = 0;
5678
5679     compute_mb_neighbors(h);
5680     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5681         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5682         return -1;
5683     }
5684
5685     if( h->slice_type_nos == FF_B_TYPE ) {
5686         if( mb_type < 23 ){
5687             partition_count= b_mb_type_info[mb_type].partition_count;
5688             mb_type=         b_mb_type_info[mb_type].type;
5689         }else{
5690             mb_type -= 23;
5691             goto decode_intra_mb;
5692         }
5693     } else if( h->slice_type_nos == FF_P_TYPE ) {
5694         if( mb_type < 5) {
5695             partition_count= p_mb_type_info[mb_type].partition_count;
5696             mb_type=         p_mb_type_info[mb_type].type;
5697         } else {
5698             mb_type -= 5;
5699             goto decode_intra_mb;
5700         }
5701     } else {
5702         if(h->slice_type == FF_SI_TYPE && mb_type)
5703             mb_type--;
5704         assert(h->slice_type_nos == FF_I_TYPE);
5705 decode_intra_mb:
5706         partition_count = 0;
5707         cbp= i_mb_type_info[mb_type].cbp;
5708         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5709         mb_type= i_mb_type_info[mb_type].type;
5710     }
5711     if(MB_FIELD)
5712         mb_type |= MB_TYPE_INTERLACED;
5713
5714     h->slice_table[ mb_xy ]= h->slice_num;
5715
5716     if(IS_INTRA_PCM(mb_type)) {
5717         const uint8_t *ptr;
5718         unsigned int x, y;
5719
5720         // We assume these blocks are very rare so we do not optimize it.
5721         // FIXME The two following lines get the bitstream position in the cabac
5722         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5723         ptr= h->cabac.bytestream;
5724         if(h->cabac.low&0x1) ptr--;
5725         if(CABAC_BITS==16){
5726             if(h->cabac.low&0x1FF) ptr--;
5727         }
5728
5729         // The pixels are stored in the same order as levels in h->mb array.
5730         for(y=0; y<16; y++){
5731             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
5732             for(x=0; x<16; x++){
5733                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", *ptr);
5734                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
5735             }
5736         }
5737         for(y=0; y<8; y++){
5738             const int index= 256 + 4*(y&3) + 32*(y>>2);
5739             for(x=0; x<8; x++){
5740                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", *ptr);
5741                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5742             }
5743         }
5744         for(y=0; y<8; y++){
5745             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
5746             for(x=0; x<8; x++){
5747                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", *ptr);
5748                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5749             }
5750         }
5751
5752         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5753
5754         // All blocks are present
5755         h->cbp_table[mb_xy] = 0x1ef;
5756         h->chroma_pred_mode_table[mb_xy] = 0;
5757         // In deblocking, the quantizer is 0
5758         s->current_picture.qscale_table[mb_xy]= 0;
5759         // All coeffs are present
5760         memset(h->non_zero_count[mb_xy], 16, 16);
5761         s->current_picture.mb_type[mb_xy]= mb_type;
5762         h->last_qscale_diff = 0;
5763         return 0;
5764     }
5765
5766     if(MB_MBAFF){
5767         h->ref_count[0] <<= 1;
5768         h->ref_count[1] <<= 1;
5769     }
5770
5771     fill_caches(h, mb_type, 0);
5772
5773     if( IS_INTRA( mb_type ) ) {
5774         int i, pred_mode;
5775         if( IS_INTRA4x4( mb_type ) ) {
5776             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5777                 mb_type |= MB_TYPE_8x8DCT;
5778                 for( i = 0; i < 16; i+=4 ) {
5779                     int pred = pred_intra_mode( h, i );
5780                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5781                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5782                 }
5783             } else {
5784                 for( i = 0; i < 16; i++ ) {
5785                     int pred = pred_intra_mode( h, i );
5786                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5787
5788                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5789                 }
5790             }
5791             write_back_intra_pred_mode(h);
5792             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5793         } else {
5794             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5795             if( h->intra16x16_pred_mode < 0 ) return -1;
5796         }
5797         h->chroma_pred_mode_table[mb_xy] =
5798         pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5799
5800         pred_mode= check_intra_pred_mode( h, pred_mode );
5801         if( pred_mode < 0 ) return -1;
5802         h->chroma_pred_mode= pred_mode;
5803     } else if( partition_count == 4 ) {
5804         int i, j, sub_partition_count[4], list, ref[2][4];
5805
5806         if( h->slice_type_nos == FF_B_TYPE ) {
5807             for( i = 0; i < 4; i++ ) {
5808                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5809                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5810                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5811             }
5812             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5813                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5814                 pred_direct_motion(h, &mb_type);
5815                 h->ref_cache[0][scan8[4]] =
5816                 h->ref_cache[1][scan8[4]] =
5817                 h->ref_cache[0][scan8[12]] =
5818                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5819                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5820                     for( i = 0; i < 4; i++ )
5821                         if( IS_DIRECT(h->sub_mb_type[i]) )
5822                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5823                 }
5824             }
5825         } else {
5826             for( i = 0; i < 4; i++ ) {
5827                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5828                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5829                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5830             }
5831         }
5832
5833         for( list = 0; list < h->list_count; list++ ) {
5834                 for( i = 0; i < 4; i++ ) {
5835                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5836                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5837                         if( h->ref_count[list] > 1 )
5838                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5839                         else
5840                             ref[list][i] = 0;
5841                     } else {
5842                         ref[list][i] = -1;
5843                     }
5844                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5845                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5846                 }
5847         }
5848
5849         if(dct8x8_allowed)
5850             dct8x8_allowed = get_dct8x8_allowed(h);
5851
5852         for(list=0; list<h->list_count; list++){
5853             for(i=0; i<4; i++){
5854                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5855                 if(IS_DIRECT(h->sub_mb_type[i])){
5856                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5857                     continue;
5858                 }
5859
5860                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5861                     const int sub_mb_type= h->sub_mb_type[i];
5862                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5863                     for(j=0; j<sub_partition_count[i]; j++){
5864                         int mpx, mpy;
5865                         int mx, my;
5866                         const int index= 4*i + block_width*j;
5867                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5868                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5869                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5870
5871                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5872                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5873                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5874
5875                         if(IS_SUB_8X8(sub_mb_type)){
5876                             mv_cache[ 1 ][0]=
5877                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5878                             mv_cache[ 1 ][1]=
5879                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5880
5881                             mvd_cache[ 1 ][0]=
5882                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5883                             mvd_cache[ 1 ][1]=
5884                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5885                         }else if(IS_SUB_8X4(sub_mb_type)){
5886                             mv_cache[ 1 ][0]= mx;
5887                             mv_cache[ 1 ][1]= my;
5888
5889                             mvd_cache[ 1 ][0]= mx - mpx;
5890                             mvd_cache[ 1 ][1]= my - mpy;
5891                         }else if(IS_SUB_4X8(sub_mb_type)){
5892                             mv_cache[ 8 ][0]= mx;
5893                             mv_cache[ 8 ][1]= my;
5894
5895                             mvd_cache[ 8 ][0]= mx - mpx;
5896                             mvd_cache[ 8 ][1]= my - mpy;
5897                         }
5898                         mv_cache[ 0 ][0]= mx;
5899                         mv_cache[ 0 ][1]= my;
5900
5901                         mvd_cache[ 0 ][0]= mx - mpx;
5902                         mvd_cache[ 0 ][1]= my - mpy;
5903                     }
5904                 }else{
5905                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5906                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5907                     p[0] = p[1] = p[8] = p[9] = 0;
5908                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5909                 }
5910             }
5911         }
5912     } else if( IS_DIRECT(mb_type) ) {
5913         pred_direct_motion(h, &mb_type);
5914         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5915         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5916         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5917     } else {
5918         int list, mx, my, i, mpx, mpy;
5919         if(IS_16X16(mb_type)){
5920             for(list=0; list<h->list_count; list++){
5921                 if(IS_DIR(mb_type, 0, list)){
5922                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
5923                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5924                 }else
5925                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5926             }
5927             for(list=0; list<h->list_count; list++){
5928                 if(IS_DIR(mb_type, 0, list)){
5929                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5930
5931                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5932                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5933                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5934
5935                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5936                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5937                 }else
5938                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5939             }
5940         }
5941         else if(IS_16X8(mb_type)){
5942             for(list=0; list<h->list_count; list++){
5943                     for(i=0; i<2; i++){
5944                         if(IS_DIR(mb_type, i, list)){
5945                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
5946                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5947                         }else
5948                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5949                     }
5950             }
5951             for(list=0; list<h->list_count; list++){
5952                 for(i=0; i<2; i++){
5953                     if(IS_DIR(mb_type, i, list)){
5954                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5955                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5956                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5957                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5958
5959                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5960                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5961                     }else{
5962                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5963                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5964                     }
5965                 }
5966             }
5967         }else{
5968             assert(IS_8X16(mb_type));
5969             for(list=0; list<h->list_count; list++){
5970                     for(i=0; i<2; i++){
5971                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5972                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
5973                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5974                         }else
5975                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5976                     }
5977             }
5978             for(list=0; list<h->list_count; list++){
5979                 for(i=0; i<2; i++){
5980                     if(IS_DIR(mb_type, i, list)){
5981                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5982                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5983                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5984
5985                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5986                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5987                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5988                     }else{
5989                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5990                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5991                     }
5992                 }
5993             }
5994         }
5995     }
5996
5997    if( IS_INTER( mb_type ) ) {
5998         h->chroma_pred_mode_table[mb_xy] = 0;
5999         write_back_motion( h, mb_type );
6000    }
6001
6002     if( !IS_INTRA16x16( mb_type ) ) {
6003         cbp  = decode_cabac_mb_cbp_luma( h );
6004         cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
6005     }
6006
6007     h->cbp_table[mb_xy] = h->cbp = cbp;
6008
6009     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
6010         if( decode_cabac_mb_transform_size( h ) )
6011             mb_type |= MB_TYPE_8x8DCT;
6012     }
6013     s->current_picture.mb_type[mb_xy]= mb_type;
6014
6015     if( cbp || IS_INTRA16x16( mb_type ) ) {
6016         const uint8_t *scan, *scan8x8, *dc_scan;
6017         const uint32_t *qmul;
6018         int dqp;
6019
6020         if(IS_INTERLACED(mb_type)){
6021             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
6022             scan= s->qscale ? h->field_scan : h->field_scan_q0;
6023             dc_scan= luma_dc_field_scan;
6024         }else{
6025             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
6026             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
6027             dc_scan= luma_dc_zigzag_scan;
6028         }
6029
6030         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
6031         if( dqp == INT_MIN ){
6032             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
6033             return -1;
6034         }
6035         s->qscale += dqp;
6036         if(((unsigned)s->qscale) > 51){
6037             if(s->qscale<0) s->qscale+= 52;
6038             else            s->qscale-= 52;
6039         }
6040         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
6041         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
6042
6043         if( IS_INTRA16x16( mb_type ) ) {
6044             int i;
6045             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
6046             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
6047
6048             if( cbp&15 ) {
6049                 qmul = h->dequant4_coeff[0][s->qscale];
6050                 for( i = 0; i < 16; i++ ) {
6051                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
6052                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
6053                 }
6054             } else {
6055                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
6056             }
6057         } else {
6058             int i8x8, i4x4;
6059             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
6060                 if( cbp & (1<<i8x8) ) {
6061                     if( IS_8x8DCT(mb_type) ) {
6062                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
6063                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
6064                     } else {
6065                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
6066                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
6067                             const int index = 4*i8x8 + i4x4;
6068                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
6069 //START_TIMER
6070                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
6071 //STOP_TIMER("decode_residual")
6072                         }
6073                     }
6074                 } else {
6075                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
6076                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
6077                 }
6078             }
6079         }
6080
6081         if( cbp&0x30 ){
6082             int c;
6083             for( c = 0; c < 2; c++ ) {
6084                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
6085                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
6086             }
6087         }
6088
6089         if( cbp&0x20 ) {
6090             int c, i;
6091             for( c = 0; c < 2; c++ ) {
6092                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
6093                 for( i = 0; i < 4; i++ ) {
6094                     const int index = 16 + 4 * c + i;
6095                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
6096                     decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15);
6097                 }
6098             }
6099         } else {
6100             uint8_t * const nnz= &h->non_zero_count_cache[0];
6101             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6102             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6103         }
6104     } else {
6105         uint8_t * const nnz= &h->non_zero_count_cache[0];
6106         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
6107         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6108         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6109         h->last_qscale_diff = 0;
6110     }
6111
6112     s->current_picture.qscale_table[mb_xy]= s->qscale;
6113     write_back_non_zero_count(h);
6114
6115     if(MB_MBAFF){
6116         h->ref_count[0] >>= 1;
6117         h->ref_count[1] >>= 1;
6118     }
6119
6120     return 0;
6121 }
6122
6123
6124 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6125     int i, d;
6126     const int index_a = qp + h->slice_alpha_c0_offset;
6127     const int alpha = (alpha_table+52)[index_a];
6128     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6129
6130     if( bS[0] < 4 ) {
6131         int8_t tc[4];
6132         for(i=0; i<4; i++)
6133             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6134         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6135     } else {
6136         /* 16px edge length, because bS=4 is triggered by being at
6137          * the edge of an intra MB, so all 4 bS are the same */
6138             for( d = 0; d < 16; d++ ) {
6139                 const int p0 = pix[-1];
6140                 const int p1 = pix[-2];
6141                 const int p2 = pix[-3];
6142
6143                 const int q0 = pix[0];
6144                 const int q1 = pix[1];
6145                 const int q2 = pix[2];
6146
6147                 if( FFABS( p0 - q0 ) < alpha &&
6148                     FFABS( p1 - p0 ) < beta &&
6149                     FFABS( q1 - q0 ) < beta ) {
6150
6151                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6152                         if( FFABS( p2 - p0 ) < beta)
6153                         {
6154                             const int p3 = pix[-4];
6155                             /* p0', p1', p2' */
6156                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6157                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6158                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6159                         } else {
6160                             /* p0' */
6161                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6162                         }
6163                         if( FFABS( q2 - q0 ) < beta)
6164                         {
6165                             const int q3 = pix[3];
6166                             /* q0', q1', q2' */
6167                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6168                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6169                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6170                         } else {
6171                             /* q0' */
6172                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6173                         }
6174                     }else{
6175                         /* p0', q0' */
6176                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6177                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6178                     }
6179                     tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
6180                 }
6181                 pix += stride;
6182             }
6183     }
6184 }
6185 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6186     int i;
6187     const int index_a = qp + h->slice_alpha_c0_offset;
6188     const int alpha = (alpha_table+52)[index_a];
6189     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6190
6191     if( bS[0] < 4 ) {
6192         int8_t tc[4];
6193         for(i=0; i<4; i++)
6194             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6195         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6196     } else {
6197         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6198     }
6199 }
6200
6201 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6202     int i;
6203     for( i = 0; i < 16; i++, pix += stride) {
6204         int index_a;
6205         int alpha;
6206         int beta;
6207
6208         int qp_index;
6209         int bS_index = (i >> 1);
6210         if (!MB_FIELD) {
6211             bS_index &= ~1;
6212             bS_index |= (i & 1);
6213         }
6214
6215         if( bS[bS_index] == 0 ) {
6216             continue;
6217         }
6218
6219         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6220         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6221         alpha = (alpha_table+52)[index_a];
6222         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6223
6224         if( bS[bS_index] < 4 ) {
6225             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
6226             const int p0 = pix[-1];
6227             const int p1 = pix[-2];
6228             const int p2 = pix[-3];
6229             const int q0 = pix[0];
6230             const int q1 = pix[1];
6231             const int q2 = pix[2];
6232
6233             if( FFABS( p0 - q0 ) < alpha &&
6234                 FFABS( p1 - p0 ) < beta &&
6235                 FFABS( q1 - q0 ) < beta ) {
6236                 int tc = tc0;
6237                 int i_delta;
6238
6239                 if( FFABS( p2 - p0 ) < beta ) {
6240                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6241                     tc++;
6242                 }
6243                 if( FFABS( q2 - q0 ) < beta ) {
6244                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6245                     tc++;
6246                 }
6247
6248                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6249                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6250                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6251                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6252             }
6253         }else{
6254             const int p0 = pix[-1];
6255             const int p1 = pix[-2];
6256             const int p2 = pix[-3];
6257
6258             const int q0 = pix[0];
6259             const int q1 = pix[1];
6260             const int q2 = pix[2];
6261
6262             if( FFABS( p0 - q0 ) < alpha &&
6263                 FFABS( p1 - p0 ) < beta &&
6264                 FFABS( q1 - q0 ) < beta ) {
6265
6266                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6267                     if( FFABS( p2 - p0 ) < beta)
6268                     {
6269                         const int p3 = pix[-4];
6270                         /* p0', p1', p2' */
6271                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6272                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6273                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6274                     } else {
6275                         /* p0' */
6276                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6277                     }
6278                     if( FFABS( q2 - q0 ) < beta)
6279                     {
6280                         const int q3 = pix[3];
6281                         /* q0', q1', q2' */
6282                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6283                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6284                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6285                     } else {
6286                         /* q0' */
6287                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6288                     }
6289                 }else{
6290                     /* p0', q0' */
6291                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6292                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6293                 }
6294                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6295             }
6296         }
6297     }
6298 }
6299 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6300     int i;
6301     for( i = 0; i < 8; i++, pix += stride) {
6302         int index_a;
6303         int alpha;
6304         int beta;
6305
6306         int qp_index;
6307         int bS_index = i;
6308
6309         if( bS[bS_index] == 0 ) {
6310             continue;
6311         }
6312
6313         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6314         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6315         alpha = (alpha_table+52)[index_a];
6316         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6317
6318         if( bS[bS_index] < 4 ) {
6319             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6320             const int p0 = pix[-1];
6321             const int p1 = pix[-2];
6322             const int q0 = pix[0];
6323             const int q1 = pix[1];
6324
6325             if( FFABS( p0 - q0 ) < alpha &&
6326                 FFABS( p1 - p0 ) < beta &&
6327                 FFABS( q1 - q0 ) < beta ) {
6328                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6329
6330                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6331                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6332                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6333             }
6334         }else{
6335             const int p0 = pix[-1];
6336             const int p1 = pix[-2];
6337             const int q0 = pix[0];
6338             const int q1 = pix[1];
6339
6340             if( FFABS( p0 - q0 ) < alpha &&
6341                 FFABS( p1 - p0 ) < beta &&
6342                 FFABS( q1 - q0 ) < beta ) {
6343
6344                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6345                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6346                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6347             }
6348         }
6349     }
6350 }
6351
6352 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6353     int i, d;
6354     const int index_a = qp + h->slice_alpha_c0_offset;
6355     const int alpha = (alpha_table+52)[index_a];
6356     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6357     const int pix_next  = stride;
6358
6359     if( bS[0] < 4 ) {
6360         int8_t tc[4];
6361         for(i=0; i<4; i++)
6362             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6363         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6364     } else {
6365         /* 16px edge length, see filter_mb_edgev */
6366             for( d = 0; d < 16; d++ ) {
6367                 const int p0 = pix[-1*pix_next];
6368                 const int p1 = pix[-2*pix_next];
6369                 const int p2 = pix[-3*pix_next];
6370                 const int q0 = pix[0];
6371                 const int q1 = pix[1*pix_next];
6372                 const int q2 = pix[2*pix_next];
6373
6374                 if( FFABS( p0 - q0 ) < alpha &&
6375                     FFABS( p1 - p0 ) < beta &&
6376                     FFABS( q1 - q0 ) < beta ) {
6377
6378                     const int p3 = pix[-4*pix_next];
6379                     const int q3 = pix[ 3*pix_next];
6380
6381                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6382                         if( FFABS( p2 - p0 ) < beta) {
6383                             /* p0', p1', p2' */
6384                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6385                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6386                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6387                         } else {
6388                             /* p0' */
6389                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6390                         }
6391                         if( FFABS( q2 - q0 ) < beta) {
6392                             /* q0', q1', q2' */
6393                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6394                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6395                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6396                         } else {
6397                             /* q0' */
6398                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6399                         }
6400                     }else{
6401                         /* p0', q0' */
6402                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6403                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6404                     }
6405                     tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6406                 }
6407                 pix++;
6408             }
6409     }
6410 }
6411
6412 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6413     int i;
6414     const int index_a = qp + h->slice_alpha_c0_offset;
6415     const int alpha = (alpha_table+52)[index_a];
6416     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6417
6418     if( bS[0] < 4 ) {
6419         int8_t tc[4];
6420         for(i=0; i<4; i++)
6421             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6422         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6423     } else {
6424         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6425     }
6426 }
6427
/**
 * Fast-path deblocking of one macroblock.
 *
 * Delegates to filter_mb() whenever the entry condition below holds. That
 * condition contains a literal "1 ||", so as written it is always true and
 * every call ends up in filter_mb(); the code after the early return cannot
 * currently be reached at run time.
 *
 * The remaining code averages the macroblock's qp with the left (mb_xy-1)
 * and top (h->top_mb_xy) neighbours, returns early when all of them are at
 * or below a threshold, and otherwise filters the luma and chroma edges with
 * either constant boundary strengths (intra macroblocks) or strengths
 * computed by s->dsp.h264_loop_filter_strength().
 *
 * @param h          decoder context
 * @param mb_x       macroblock column
 * @param mb_y       macroblock row
 * @param img_y      luma samples of this macroblock
 * @param img_cb     Cb samples of this macroblock
 * @param img_cr     Cr samples of this macroblock
 * @param linesize   luma stride
 * @param uvlinesize chroma stride
 */
6428 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6429     MpegEncContext * const s = &h->s;
         /* First macroblock row handled here: 1 for a bottom field, else 0. */
6430     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6431     int mb_xy, mb_type;
6432     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6433
6434     mb_xy = h->mb_xy;
6435
         /* Fall back to the general filter. NOTE(review): the bare "1 ||"
          * forces this branch unconditionally, which looks like a deliberate
          * switch-off of the fast path -- confirm before removing it. */
6436     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6437 1 ||
6438        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6439                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6440         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6441         return;
6442     }
6443     assert(!FRAME_MBAFF);
6444
         /* qp / qpc: this macroblock; qp0 / qpc0: left neighbour;
          * qp1 / qpc1: top neighbour. The neighbour values are then replaced
          * by the rounded average with the current macroblock. */
6445     mb_type = s->current_picture.mb_type[mb_xy];
6446     qp = s->current_picture.qscale_table[mb_xy];
6447     qp0 = s->current_picture.qscale_table[mb_xy-1];
6448     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6449     qpc = get_chroma_qp( h, 0, qp );
6450     qpc0 = get_chroma_qp( h, 0, qp0 );
6451     qpc1 = get_chroma_qp( h, 0, qp1 );
6452     qp0 = (qp + qp0 + 1) >> 1;
6453     qp1 = (qp + qp1 + 1) >> 1;
6454     qpc0 = (qpc + qpc0 + 1) >> 1;
6455     qpc1 = (qpc + qpc1 + 1) >> 1;
         /* Nothing to do when every luma and chroma qp is at or below the
          * threshold. */
6456     qp_thresh = 15 - h->slice_alpha_c0_offset;
6457     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6458        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6459         return;
6460
6461     if( IS_INTRA(mb_type) ) {
             /* Intra macroblock: constant strengths. The left edge uses 4,
              * the top edge uses 4 (3 in field pictures), inner edges use 3. */
6462         int16_t bS4[4] = {4,4,4,4};
6463         int16_t bS3[4] = {3,3,3,3};
6464         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6465         if( IS_8x8DCT(mb_type) ) {
                 /* 8x8 transform: only edges 0 and 2 are filtered in luma. */
6466             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6467             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6468             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6469             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6470         } else {
6471             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6472             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6473             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6474             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6475             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6476             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6477             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6478             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6479         }
             /* Chroma: edges 0 and 2 of both planes, in both directions. */
6480         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6481         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6482         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6483         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6484         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6485         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6486         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6487         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6488         return;
6489     } else {
             /* bS[dir][edge][i]: dir 0 is used with the vertical-edge
              * filters, dir 1 with the horizontal-edge ones (see the FILTER
              * calls below). bSv views each group of four int16_t strengths
              * as one uint64_t so that a whole edge can be set or tested at
              * once. NOTE(review): this is type punning through a pointer
              * cast -- confirm the build does not rely on strict aliasing. */
6490         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6491         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6492         int edges;
6493         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
                 /* Strength 2 is written to edges 0 and 2 in both directions
                  * without calling the dsp routine. */
6494             edges = 4;
6495             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6496         } else {
6497             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6498                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6499             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6500                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6501                              ? 3 : 0;
6502             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6503             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6504             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6505                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6506         }
             /* An intra neighbour overrides the strength of the shared edge:
              * 4 for the left edge, 4 (3 in field pictures) for the top. */
6507         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6508             bSv[0][0] = 0x0004000400040004ULL;
6509         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6510             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6511
         /* FILTER(hv,dir,edge): if any of the four strengths of that edge is
          * non-zero, filter the luma edge and, for even edges only, the two
          * chroma planes. Edge 0 uses the neighbour-averaged qp (qp0/qp1 or
          * qpc0/qpc1 selected by dir), inner edges use qp/qpc. */
6512 #define FILTER(hv,dir,edge)\
6513         if(bSv[dir][edge]) {\
6514             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6515             if(!(edge&1)) {\
6516                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6517                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6518             }\
6519         }
6520         if( edges == 1 ) {
6521             FILTER(v,0,0);
6522             FILTER(h,1,0);
6523         } else if( IS_8x8DCT(mb_type) ) {
6524             FILTER(v,0,0);
6525             FILTER(v,0,2);
6526             FILTER(h,1,0);
6527             FILTER(h,1,2);
6528         } else {
6529             FILTER(v,0,0);
6530             FILTER(v,0,1);
6531             FILTER(v,0,2);
6532             FILTER(v,0,3);
6533             FILTER(h,1,0);
6534             FILTER(h,1,1);
6535             FILTER(h,1,2);
6536             FILTER(h,1,3);
6537         }
6538 #undef FILTER
6539     }
6540 }
6541
6542 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6543     MpegEncContext * const s = &h->s;
6544     const int mb_xy= mb_x + mb_y*s->mb_stride;
6545     const int mb_type = s->current_picture.mb_type[mb_xy];
6546     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6547     int first_vertical_edge_done = 0;
6548     int dir;
6549
6550     //for sufficiently low qp, filtering wouldn't do anything
6551     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6552     if(!FRAME_MBAFF){
6553         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6554         int qp = s->current_picture.qscale_table[mb_xy];
6555         if(qp <= qp_thresh
6556            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6557            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6558             return;
6559         }
6560     }
6561
6562     if (FRAME_MBAFF
6563             // left mb is in picture
6564             && h->slice_table[mb_xy-1] != 255
6565             // and current and left pair do not have the same interlaced type
6566             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6567             // and left mb is in the same slice if deblocking_filter == 2
6568             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6569         /* First vertical edge is different in MBAFF frames
6570          * There are 8 different bS to compute and 2 different Qp
6571          */
6572         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6573         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6574         int16_t bS[8];
6575         int qp[2];
6576         int bqp[2];
6577         int rqp[2];
6578         int mb_qp, mbn0_qp, mbn1_qp;
6579         int i;
6580         first_vertical_edge_done = 1;
6581
6582         if( IS_INTRA(mb_type) )
6583             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6584         else {
6585             for( i = 0; i < 8; i++ ) {
6586                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6587
6588                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6589                     bS[i] = 4;
6590                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6591                          /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
6592                          h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
6593                     bS[i] = 2;
6594                 else
6595                     bS[i] = 1;
6596             }
6597         }
6598
6599         mb_qp = s->current_picture.qscale_table[mb_xy];
6600         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6601         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6602         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6603         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6604                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6605         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6606                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6607         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6608         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6609                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6610         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6611                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6612
6613         /* Filter edge */
6614         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6615         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6616         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6617         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6618         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6619     }
6620     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6621     for( dir = 0; dir < 2; dir++ )
6622     {
6623         int edge;
6624         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6625         const int mbm_type = s->current_picture.mb_type[mbm_xy];
6626         int (*ref2frm) [48+2] = h->ref2frm[ h->slice_num          &15 ];
6627         int (*ref2frmm)[48+2] = h->ref2frm[ h->slice_table[mbm_xy]&15 ];
6628         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
6629
6630         const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6631                                   == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6632         // how often to recheck mv-based bS when iterating between edges
6633         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6634                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6635         // how often to recheck mv-based bS when iterating along each edge
6636         const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6637
6638         if (first_vertical_edge_done) {
6639             start = 1;
6640             first_vertical_edge_done = 0;
6641         }
6642
6643         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6644             start = 1;
6645
6646         if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6647             && !IS_INTERLACED(mb_type)
6648             && IS_INTERLACED(mbm_type)
6649             ) {
6650             // This is a special case in the norm where the filtering must
6651             // be done twice (one each of the field) even if we are in a
6652             // frame macroblock.
6653             //
6654             static const int nnz_idx[4] = {4,5,6,3};
6655             unsigned int tmp_linesize   = 2 *   linesize;
6656             unsigned int tmp_uvlinesize = 2 * uvlinesize;
6657             int mbn_xy = mb_xy - 2 * s->mb_stride;
6658             int qp;
6659             int i, j;
6660             int16_t bS[4];
6661
6662             for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6663                 if( IS_INTRA(mb_type) ||
6664                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6665                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6666                 } else {
6667                     const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6668                     for( i = 0; i < 4; i++ ) {
6669                         if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6670                             mbn_nnz[nnz_idx[i]] != 0 )
6671                             bS[i] = 2;
6672                         else
6673                             bS[i] = 1;
6674                     }
6675                 }
6676                 // Do not use s->qscale as luma quantizer because it has not the same
6677                 // value in IPCM macroblocks.
6678                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6679                 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6680                 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6681                 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6682                 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6683                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6684                 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6685                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6686             }
6687
6688             start = 1;
6689         }
6690
6691         /* Calculate bS */
6692         for( edge = start; edge < edges; edge++ ) {
6693             /* mbn_xy: neighbor macroblock */
6694             const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6695             const int mbn_type = s->current_picture.mb_type[mbn_xy];
6696             int (*ref2frmn)[48+2] = edge > 0 ? ref2frm : ref2frmm;
6697             int16_t bS[4];
6698             int qp;
6699
6700             if( (edge&1) && IS_8x8DCT(mb_type) )
6701                 continue;
6702
6703             if( IS_INTRA(mb_type) ||
6704                 IS_INTRA(mbn_type) ) {
6705                 int value;
6706                 if (edge == 0) {
6707                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6708                         || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6709                     ) {
6710                         value = 4;
6711                     } else {
6712                         value = 3;
6713                     }
6714                 } else {
6715                     value = 3;
6716                 }
6717                 bS[0] = bS[1] = bS[2] = bS[3] = value;
6718             } else {
6719                 int i, l;
6720                 int mv_done;
6721
6722                 if( edge & mask_edge ) {
6723                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
6724                     mv_done = 1;
6725                 }
6726                 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6727                     bS[0] = bS[1] = bS[2] = bS[3] = 1;
6728                     mv_done = 1;
6729                 }
6730                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6731                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
6732                     int bn_idx= b_idx - (dir ? 8:1);
6733                     int v = 0;
6734                     int xn= h->slice_type_nos == FF_B_TYPE && ref2frm[0][h->ref_cache[0][b_idx]+2] != ref2frmn[0][h->ref_cache[0][bn_idx]+2];
6735
6736                     for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6737                         int ln= l^xn;
6738                         v |= ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[ln][h->ref_cache[ln][bn_idx]+2] ||
6739                              FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6740                              FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6741                     }
6742                     bS[0] = bS[1] = bS[2] = bS[3] = v;
6743                     mv_done = 1;
6744                 }
6745                 else
6746                     mv_done = 0;
6747
6748                 for( i = 0; i < 4; i++ ) {
6749                     int x = dir == 0 ? edge : i;
6750                     int y = dir == 0 ? i    : edge;
6751                     int b_idx= 8 + 4 + x + 8*y;
6752                     int bn_idx= b_idx - (dir ? 8:1);
6753
6754                     if( h->non_zero_count_cache[b_idx] != 0 ||
6755                         h->non_zero_count_cache[bn_idx] != 0 ) {
6756                         bS[i] = 2;
6757                     }
6758                     else if(!mv_done)
6759                     {
6760                         int xn= h->slice_type_nos == FF_B_TYPE && ref2frm[0][h->ref_cache[0][b_idx]+2] != ref2frmn[0][h->ref_cache[0][bn_idx]+2];
6761                         bS[i] = 0;
6762                         for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6763                             int ln= l^xn;
6764                             if( ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[ln][h->ref_cache[ln][bn_idx]+2] ||
6765                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6766                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6767                                 bS[i] = 1;
6768                                 break;
6769                             }
6770                         }
6771                     }
6772                 }
6773
6774                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6775                     continue;
6776             }
6777
6778             /* Filter edge */
6779             // Do not use s->qscale as luma quantizer because it has not the same
6780             // value in IPCM macroblocks.
6781             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6782             //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6783             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6784             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6785             if( dir == 0 ) {
6786                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6787                 if( (edge&1) == 0 ) {
6788                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6789                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6790                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6791                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6792                 }
6793             } else {
6794                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6795                 if( (edge&1) == 0 ) {
6796                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6797                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6798                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6799                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6800                 }
6801             }
6802         }
6803     }
6804 }
6805
6806 static int decode_slice(struct AVCodecContext *avctx, H264Context *h){
6807     MpegEncContext * const s = &h->s;
6808     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6809
6810     s->mb_skip_run= -1;
6811
6812     if( h->pps.cabac ) {
6813         int i;
6814
6815         /* realign */
6816         align_get_bits( &s->gb );
6817
6818         /* init cabac */
6819         ff_init_cabac_states( &h->cabac);
6820         ff_init_cabac_decoder( &h->cabac,
6821                                s->gb.buffer + get_bits_count(&s->gb)/8,
6822                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6823         /* calculate pre-state */
6824         for( i= 0; i < 460; i++ ) {
6825             int pre;
6826             if( h->slice_type_nos == FF_I_TYPE )
6827                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6828             else
6829                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6830
6831             if( pre <= 63 )
6832                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6833             else
6834                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6835         }
6836
6837         for(;;){
6838 //START_TIMER
6839             int ret = decode_mb_cabac(h);
6840             int eos;
6841 //STOP_TIMER("decode_mb_cabac")
6842
6843             if(ret>=0) hl_decode_mb(h);
6844
6845             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6846                 s->mb_y++;
6847
6848                 if(ret>=0) ret = decode_mb_cabac(h);
6849
6850                 if(ret>=0) hl_decode_mb(h);
6851                 s->mb_y--;
6852             }
6853             eos = get_cabac_terminate( &h->cabac );
6854
6855             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6856                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6857                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6858                 return -1;
6859             }
6860
6861             if( ++s->mb_x >= s->mb_width ) {
6862                 s->mb_x = 0;
6863                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6864                 ++s->mb_y;
6865                 if(FIELD_OR_MBAFF_PICTURE) {
6866                     ++s->mb_y;
6867                 }
6868             }
6869
6870             if( eos || s->mb_y >= s->mb_height ) {
6871                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6872                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6873                 return 0;
6874             }
6875         }
6876
6877     } else {
6878         for(;;){
6879             int ret = decode_mb_cavlc(h);
6880
6881             if(ret>=0) hl_decode_mb(h);
6882
6883             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6884                 s->mb_y++;
6885                 ret = decode_mb_cavlc(h);
6886
6887                 if(ret>=0) hl_decode_mb(h);
6888                 s->mb_y--;
6889             }
6890
6891             if(ret<0){
6892                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6893                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6894
6895                 return -1;
6896             }
6897
6898             if(++s->mb_x >= s->mb_width){
6899                 s->mb_x=0;
6900                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6901                 ++s->mb_y;
6902                 if(FIELD_OR_MBAFF_PICTURE) {
6903                     ++s->mb_y;
6904                 }
6905                 if(s->mb_y >= s->mb_height){
6906                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6907
6908                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6909                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6910
6911                         return 0;
6912                     }else{
6913                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6914
6915                         return -1;
6916                     }
6917                 }
6918             }
6919
6920             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6921                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6922                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6923                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6924
6925                     return 0;
6926                 }else{
6927                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6928
6929                     return -1;
6930                 }
6931             }
6932         }
6933     }
6934
6935 #if 0
6936     for(;s->mb_y < s->mb_height; s->mb_y++){
6937         for(;s->mb_x < s->mb_width; s->mb_x++){
6938             int ret= decode_mb(h);
6939
6940             hl_decode_mb(h);
6941
6942             if(ret<0){
6943                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6944                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6945
6946                 return -1;
6947             }
6948
6949             if(++s->mb_x >= s->mb_width){
6950                 s->mb_x=0;
6951                 if(++s->mb_y >= s->mb_height){
6952                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6953                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6954
6955                         return 0;
6956                     }else{
6957                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6958
6959                         return -1;
6960                     }
6961                 }
6962             }
6963
6964             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6965                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6966                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6967
6968                     return 0;
6969                 }else{
6970                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6971
6972                     return -1;
6973                 }
6974             }
6975         }
6976         s->mb_x=0;
6977         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6978     }
6979 #endif
6980     return -1; //not reached
6981 }
6982
6983 static int decode_unregistered_user_data(H264Context *h, int size){
6984     MpegEncContext * const s = &h->s;
6985     uint8_t user_data[16+256];
6986     int e, build, i;
6987
6988     if(size<16)
6989         return -1;
6990
6991     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6992         user_data[i]= get_bits(&s->gb, 8);
6993     }
6994
6995     user_data[i]= 0;
6996     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6997     if(e==1 && build>=0)
6998         h->x264_build= build;
6999
7000     if(s->avctx->debug & FF_DEBUG_BUGS)
7001         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
7002
7003     for(; i<size; i++)
7004         skip_bits(&s->gb, 8);
7005
7006     return 0;
7007 }
7008
7009 static int decode_sei(H264Context *h){
7010     MpegEncContext * const s = &h->s;
7011
7012     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
7013         int size, type;
7014
7015         type=0;
7016         do{
7017             type+= show_bits(&s->gb, 8);
7018         }while(get_bits(&s->gb, 8) == 255);
7019
7020         size=0;
7021         do{
7022             size+= show_bits(&s->gb, 8);
7023         }while(get_bits(&s->gb, 8) == 255);
7024
7025         switch(type){
7026         case 5:
7027             if(decode_unregistered_user_data(h, size) < 0)
7028                 return -1;
7029             break;
7030         default:
7031             skip_bits(&s->gb, 8*size);
7032         }
7033
7034         //FIXME check bits here
7035         align_get_bits(&s->gb);
7036     }
7037
7038     return 0;
7039 }
7040
7041 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
7042     MpegEncContext * const s = &h->s;
7043     int cpb_count, i;
7044     cpb_count = get_ue_golomb(&s->gb) + 1;
7045     get_bits(&s->gb, 4); /* bit_rate_scale */
7046     get_bits(&s->gb, 4); /* cpb_size_scale */
7047     for(i=0; i<cpb_count; i++){
7048         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
7049         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
7050         get_bits1(&s->gb);     /* cbr_flag */
7051     }
7052     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
7053     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
7054     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
7055     get_bits(&s->gb, 5); /* time_offset_length */
7056 }
7057
7058 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
7059     MpegEncContext * const s = &h->s;
7060     int aspect_ratio_info_present_flag;
7061     unsigned int aspect_ratio_idc;
7062     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
7063
7064     aspect_ratio_info_present_flag= get_bits1(&s->gb);
7065
7066     if( aspect_ratio_info_present_flag ) {
7067         aspect_ratio_idc= get_bits(&s->gb, 8);
7068         if( aspect_ratio_idc == EXTENDED_SAR ) {
7069             sps->sar.num= get_bits(&s->gb, 16);
7070             sps->sar.den= get_bits(&s->gb, 16);
7071         }else if(aspect_ratio_idc < sizeof(pixel_aspect)/sizeof(*pixel_aspect)){
7072             sps->sar=  pixel_aspect[aspect_ratio_idc];
7073         }else{
7074             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
7075             return -1;
7076         }
7077     }else{
7078         sps->sar.num=
7079         sps->sar.den= 0;
7080     }
7081 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
7082
7083     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
7084         get_bits1(&s->gb);      /* overscan_appropriate_flag */
7085     }
7086
7087     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
7088         get_bits(&s->gb, 3);    /* video_format */
7089         get_bits1(&s->gb);      /* video_full_range_flag */
7090         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
7091             get_bits(&s->gb, 8); /* colour_primaries */
7092             get_bits(&s->gb, 8); /* transfer_characteristics */
7093             get_bits(&s->gb, 8); /* matrix_coefficients */
7094         }
7095     }
7096
7097     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
7098         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
7099         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
7100     }
7101
7102     sps->timing_info_present_flag = get_bits1(&s->gb);
7103     if(sps->timing_info_present_flag){
7104         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7105         sps->time_scale = get_bits_long(&s->gb, 32);
7106         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7107     }
7108
7109     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7110     if(nal_hrd_parameters_present_flag)
7111         decode_hrd_parameters(h, sps);
7112     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7113     if(vcl_hrd_parameters_present_flag)
7114         decode_hrd_parameters(h, sps);
7115     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
7116         get_bits1(&s->gb);     /* low_delay_hrd_flag */
7117     get_bits1(&s->gb);         /* pic_struct_present_flag */
7118
7119     sps->bitstream_restriction_flag = get_bits1(&s->gb);
7120     if(sps->bitstream_restriction_flag){
7121         unsigned int num_reorder_frames;
7122         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
7123         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7124         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7125         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7126         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7127         num_reorder_frames= get_ue_golomb(&s->gb);
7128         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7129
7130         if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7131             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", num_reorder_frames);
7132             return -1;
7133         }
7134
7135         sps->num_reorder_frames= num_reorder_frames;
7136     }
7137
7138     return 0;
7139 }
7140
7141 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7142                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7143     MpegEncContext * const s = &h->s;
7144     int i, last = 8, next = 8;
7145     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
7146     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7147         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7148     else
7149     for(i=0;i<size;i++){
7150         if(next)
7151             next = (last + get_se_golomb(&s->gb)) & 0xff;
7152         if(!i && !next){ /* matrix not written, we use the preset one */
7153             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7154             break;
7155         }
7156         last = factors[scan[i]] = next ? next : last;
7157     }
7158 }
7159
7160 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7161                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7162     MpegEncContext * const s = &h->s;
7163     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7164     const uint8_t *fallback[4] = {
7165         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7166         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7167         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7168         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7169     };
7170     if(get_bits1(&s->gb)){
7171         sps->scaling_matrix_present |= is_sps;
7172         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7173         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7174         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7175         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7176         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7177         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7178         if(is_sps || pps->transform_8x8_mode){
7179             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7180             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7181         }
7182     } else if(fallback_sps) {
7183         memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
7184         memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
7185     }
7186 }
7187
7188 /**
7189  * Returns and optionally allocates SPS / PPS structures in the supplied array 'vec'
7190  */
7191 static void *
7192 alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
7193                     const size_t size, const char *name)
7194 {
7195     if(id>=max) {
7196         av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%d) out of range\n", name, id);
7197         return NULL;
7198     }
7199
7200     if(!vec[id]) {
7201         vec[id] = av_mallocz(size);
7202         if(vec[id] == NULL)
7203             av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
7204     }
7205     return vec[id];
7206 }
7207
7208 static inline int decode_seq_parameter_set(H264Context *h){
7209     MpegEncContext * const s = &h->s;
7210     int profile_idc, level_idc;
7211     unsigned int sps_id, tmp, mb_width, mb_height;
7212     int i;
7213     SPS *sps;
7214
7215     profile_idc= get_bits(&s->gb, 8);
7216     get_bits1(&s->gb);   //constraint_set0_flag
7217     get_bits1(&s->gb);   //constraint_set1_flag
7218     get_bits1(&s->gb);   //constraint_set2_flag
7219     get_bits1(&s->gb);   //constraint_set3_flag
7220     get_bits(&s->gb, 4); // reserved
7221     level_idc= get_bits(&s->gb, 8);
7222     sps_id= get_ue_golomb(&s->gb);
7223
7224     sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
7225     if(sps == NULL)
7226         return -1;
7227
7228     sps->profile_idc= profile_idc;
7229     sps->level_idc= level_idc;
7230
7231     if(sps->profile_idc >= 100){ //high profile
7232         if(get_ue_golomb(&s->gb) == 3) //chroma_format_idc
7233             get_bits1(&s->gb);  //residual_color_transform_flag
7234         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7235         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7236         sps->transform_bypass = get_bits1(&s->gb);
7237         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7238     }else
7239         sps->scaling_matrix_present = 0;
7240
7241     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7242     sps->poc_type= get_ue_golomb(&s->gb);
7243
7244     if(sps->poc_type == 0){ //FIXME #define
7245         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7246     } else if(sps->poc_type == 1){//FIXME #define
7247         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7248         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7249         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7250         tmp= get_ue_golomb(&s->gb);
7251
7252         if(tmp >= sizeof(sps->offset_for_ref_frame) / sizeof(sps->offset_for_ref_frame[0])){
7253             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
7254             return -1;
7255         }
7256         sps->poc_cycle_length= tmp;
7257
7258         for(i=0; i<sps->poc_cycle_length; i++)
7259             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7260     }else if(sps->poc_type != 2){
7261         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7262         return -1;
7263     }
7264
7265     tmp= get_ue_golomb(&s->gb);
7266     if(tmp > MAX_PICTURE_COUNT-2 || tmp >= 32){
7267         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7268         return -1;
7269     }
7270     sps->ref_frame_count= tmp;
7271     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7272     mb_width= get_ue_golomb(&s->gb) + 1;
7273     mb_height= get_ue_golomb(&s->gb) + 1;
7274     if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
7275        avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
7276         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7277         return -1;
7278     }
7279     sps->mb_width = mb_width;
7280     sps->mb_height= mb_height;
7281
7282     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7283     if(!sps->frame_mbs_only_flag)
7284         sps->mb_aff= get_bits1(&s->gb);
7285     else
7286         sps->mb_aff= 0;
7287
7288     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7289
7290 #ifndef ALLOW_INTERLACE
7291     if(sps->mb_aff)
7292         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7293 #endif
7294     if(!sps->direct_8x8_inference_flag && sps->mb_aff)
7295         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
7296
7297     sps->crop= get_bits1(&s->gb);
7298     if(sps->crop){
7299         sps->crop_left  = get_ue_golomb(&s->gb);
7300         sps->crop_right = get_ue_golomb(&s->gb);
7301         sps->crop_top   = get_ue_golomb(&s->gb);
7302         sps->crop_bottom= get_ue_golomb(&s->gb);
7303         if(sps->crop_left || sps->crop_top){
7304             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7305         }
7306         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !h->sps.frame_mbs_only_flag)){
7307             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7308         }
7309     }else{
7310         sps->crop_left  =
7311         sps->crop_right =
7312         sps->crop_top   =
7313         sps->crop_bottom= 0;
7314     }
7315
7316     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7317     if( sps->vui_parameters_present_flag )
7318         decode_vui_parameters(h, sps);
7319
7320     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7321         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
7322                sps_id, sps->profile_idc, sps->level_idc,
7323                sps->poc_type,
7324                sps->ref_frame_count,
7325                sps->mb_width, sps->mb_height,
7326                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7327                sps->direct_8x8_inference_flag ? "8B8" : "",
7328                sps->crop_left, sps->crop_right,
7329                sps->crop_top, sps->crop_bottom,
7330                sps->vui_parameters_present_flag ? "VUI" : ""
7331                );
7332     }
7333     return 0;
7334 }
7335
7336 static void
7337 build_qp_table(PPS *pps, int t, int index)
7338 {
7339     int i;
7340     for(i = 0; i < 255; i++)
7341         pps->chroma_qp_table[t][i & 0xff] = chroma_qp[av_clip(i + index, 0, 51)];
7342 }
7343
7344 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7345     MpegEncContext * const s = &h->s;
7346     unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
7347     PPS *pps;
7348
7349     pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
7350     if(pps == NULL)
7351         return -1;
7352
7353     tmp= get_ue_golomb(&s->gb);
7354     if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
7355         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7356         return -1;
7357     }
7358     pps->sps_id= tmp;
7359
7360     pps->cabac= get_bits1(&s->gb);
7361     pps->pic_order_present= get_bits1(&s->gb);
7362     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7363     if(pps->slice_group_count > 1 ){
7364         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7365         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7366         switch(pps->mb_slice_group_map_type){
7367         case 0:
7368 #if 0
7369 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7370 |    run_length[ i ]                                |1  |ue(v)   |
7371 #endif
7372             break;
7373         case 2:
7374 #if 0
7375 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7376 |{                                                  |   |        |
7377 |    top_left_mb[ i ]                               |1  |ue(v)   |
7378 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7379 |   }                                               |   |        |
7380 #endif
7381             break;
7382         case 3:
7383         case 4:
7384         case 5:
7385 #if 0
7386 |   slice_group_change_direction_flag               |1  |u(1)    |
7387 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7388 #endif
7389             break;
7390         case 6:
7391 #if 0
7392 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7393 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7394 |)                                                  |   |        |
7395 |    slice_group_id[ i ]                            |1  |u(v)    |
7396 #endif
7397             break;
7398         }
7399     }
7400     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7401     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7402     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7403         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7404         pps->ref_count[0]= pps->ref_count[1]= 1;
7405         return -1;
7406     }
7407
7408     pps->weighted_pred= get_bits1(&s->gb);
7409     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7410     pps->init_qp= get_se_golomb(&s->gb) + 26;
7411     pps->init_qs= get_se_golomb(&s->gb) + 26;
7412     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7413     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7414     pps->constrained_intra_pred= get_bits1(&s->gb);
7415     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7416
7417     pps->transform_8x8_mode= 0;
7418     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7419     memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
7420     memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
7421
7422     if(get_bits_count(&s->gb) < bit_length){
7423         pps->transform_8x8_mode= get_bits1(&s->gb);
7424         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7425         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7426     } else {
7427         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7428     }
7429
7430     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7431     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7432     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7433         h->pps.chroma_qp_diff= 1;
7434
7435     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7436         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7437                pps_id, pps->sps_id,
7438                pps->cabac ? "CABAC" : "CAVLC",
7439                pps->slice_group_count,
7440                pps->ref_count[0], pps->ref_count[1],
7441                pps->weighted_pred ? "weighted" : "",
7442                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7443                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7444                pps->constrained_intra_pred ? "CONSTR" : "",
7445                pps->redundant_pic_cnt_present ? "REDU" : "",
7446                pps->transform_8x8_mode ? "8x8DCT" : ""
7447                );
7448     }
7449
7450     return 0;
7451 }
7452
7453 /**
7454  * Call decode_slice() for each context.
7455  *
7456  * @param h h264 master context
7457  * @param context_count number of contexts to execute
7458  */
7459 static void execute_decode_slices(H264Context *h, int context_count){
7460     MpegEncContext * const s = &h->s;
7461     AVCodecContext * const avctx= s->avctx;
7462     H264Context *hx;
7463     int i;
7464
7465     if(context_count == 1) {
7466         decode_slice(avctx, h);
7467     } else {
7468         for(i = 1; i < context_count; i++) {
7469             hx = h->thread_context[i];
7470             hx->s.error_resilience = avctx->error_resilience;
7471             hx->s.error_count = 0;
7472         }
7473
7474         avctx->execute(avctx, (void *)decode_slice,
7475                        (void **)h->thread_context, NULL, context_count);
7476
7477         /* pull back stuff from slices to master context */
7478         hx = h->thread_context[context_count - 1];
7479         s->mb_x = hx->s.mb_x;
7480         s->mb_y = hx->s.mb_y;
7481         s->dropable = hx->s.dropable;
7482         s->picture_structure = hx->s.picture_structure;
7483         for(i = 1; i < context_count; i++)
7484             h->s.error_count += h->thread_context[i]->s.error_count;
7485     }
7486 }
7487
7488
7489 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7490     MpegEncContext * const s = &h->s;
7491     AVCodecContext * const avctx= s->avctx;
7492     int buf_index=0;
7493     H264Context *hx; ///< thread context
7494     int context_count = 0;
7495
7496     h->max_contexts = avctx->thread_count;
7497 #if 0
7498     int i;
7499     for(i=0; i<50; i++){
7500         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7501     }
7502 #endif
7503     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7504         h->current_slice = 0;
7505         if (!s->first_field)
7506             s->current_picture_ptr= NULL;
7507     }
7508
7509     for(;;){
7510         int consumed;
7511         int dst_length;
7512         int bit_length;
7513         const uint8_t *ptr;
7514         int i, nalsize = 0;
7515         int err;
7516
7517         if(h->is_avc) {
7518             if(buf_index >= buf_size) break;
7519             nalsize = 0;
7520             for(i = 0; i < h->nal_length_size; i++)
7521                 nalsize = (nalsize << 8) | buf[buf_index++];
7522             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7523                 if(nalsize == 1){
7524                     buf_index++;
7525                     continue;
7526                 }else{
7527                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7528                     break;
7529                 }
7530             }
7531         } else {
7532             // start code prefix search
7533             for(; buf_index + 3 < buf_size; buf_index++){
7534                 // This should always succeed in the first iteration.
7535                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7536                     break;
7537             }
7538
7539             if(buf_index+3 >= buf_size) break;
7540
7541             buf_index+=3;
7542         }
7543
7544         hx = h->thread_context[context_count];
7545
7546         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7547         if (ptr==NULL || dst_length < 0){
7548             return -1;
7549         }
7550         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7551             dst_length--;
7552         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7553
7554         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7555             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7556         }
7557
7558         if (h->is_avc && (nalsize != consumed)){
7559             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7560             consumed= nalsize;
7561         }
7562
7563         buf_index += consumed;
7564
7565         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7566            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7567             continue;
7568
7569       again:
7570         err = 0;
7571         switch(hx->nal_unit_type){
7572         case NAL_IDR_SLICE:
7573             if (h->nal_unit_type != NAL_IDR_SLICE) {
7574                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7575                 return -1;
7576             }
7577             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7578         case NAL_SLICE:
7579             init_get_bits(&hx->s.gb, ptr, bit_length);
7580             hx->intra_gb_ptr=
7581             hx->inter_gb_ptr= &hx->s.gb;
7582             hx->s.data_partitioning = 0;
7583
7584             if((err = decode_slice_header(hx, h)))
7585                break;
7586
7587             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7588             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7589                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7590                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7591                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7592                && avctx->skip_frame < AVDISCARD_ALL)
7593                 context_count++;
7594             break;
7595         case NAL_DPA:
7596             init_get_bits(&hx->s.gb, ptr, bit_length);
7597             hx->intra_gb_ptr=
7598             hx->inter_gb_ptr= NULL;
7599             hx->s.data_partitioning = 1;
7600
7601             err = decode_slice_header(hx, h);
7602             break;
7603         case NAL_DPB:
7604             init_get_bits(&hx->intra_gb, ptr, bit_length);
7605             hx->intra_gb_ptr= &hx->intra_gb;
7606             break;
7607         case NAL_DPC:
7608             init_get_bits(&hx->inter_gb, ptr, bit_length);
7609             hx->inter_gb_ptr= &hx->inter_gb;
7610
7611             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7612                && s->context_initialized
7613                && s->hurry_up < 5
7614                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7615                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7616                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7617                && avctx->skip_frame < AVDISCARD_ALL)
7618                 context_count++;
7619             break;
7620         case NAL_SEI:
7621             init_get_bits(&s->gb, ptr, bit_length);
7622             decode_sei(h);
7623             break;
7624         case NAL_SPS:
7625             init_get_bits(&s->gb, ptr, bit_length);
7626             decode_seq_parameter_set(h);
7627
7628             if(s->flags& CODEC_FLAG_LOW_DELAY)
7629                 s->low_delay=1;
7630
7631             if(avctx->has_b_frames < 2)
7632                 avctx->has_b_frames= !s->low_delay;
7633             break;
7634         case NAL_PPS:
7635             init_get_bits(&s->gb, ptr, bit_length);
7636
7637             decode_picture_parameter_set(h, bit_length);
7638
7639             break;
7640         case NAL_AUD:
7641         case NAL_END_SEQUENCE:
7642         case NAL_END_STREAM:
7643         case NAL_FILLER_DATA:
7644         case NAL_SPS_EXT:
7645         case NAL_AUXILIARY_SLICE:
7646             break;
7647         default:
7648             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7649         }
7650
7651         if(context_count == h->max_contexts) {
7652             execute_decode_slices(h, context_count);
7653             context_count = 0;
7654         }
7655
7656         if (err < 0)
7657             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7658         else if(err == 1) {
7659             /* Slice could not be decoded in parallel mode, copy down
7660              * NAL unit stuff to context 0 and restart. Note that
7661              * rbsp_buffer is not transfered, but since we no longer
7662              * run in parallel mode this should not be an issue. */
7663             h->nal_unit_type = hx->nal_unit_type;
7664             h->nal_ref_idc   = hx->nal_ref_idc;
7665             hx = h;
7666             goto again;
7667         }
7668     }
7669     if(context_count)
7670         execute_decode_slices(h, context_count);
7671     return buf_index;
7672 }
7673
7674 /**
7675  * returns the number of bytes consumed for building the current frame
7676  */
7677 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7678     if(s->flags&CODEC_FLAG_TRUNCATED){
7679         pos -= s->parse_context.last_index;
7680         if(pos<0) pos=0; // FIXME remove (unneeded?)
7681
7682         return pos;
7683     }else{
7684         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7685         if(pos+10>buf_size) pos=buf_size; // oops ;)
7686
7687         return pos;
7688     }
7689 }
7690
7691 static int decode_frame(AVCodecContext *avctx,
7692                              void *data, int *data_size,
7693                              const uint8_t *buf, int buf_size)
7694 {
7695     H264Context *h = avctx->priv_data;
7696     MpegEncContext *s = &h->s;
7697     AVFrame *pict = data;
7698     int buf_index;
7699
7700     s->flags= avctx->flags;
7701     s->flags2= avctx->flags2;
7702
7703     if(s->flags&CODEC_FLAG_TRUNCATED){
7704         const int next= ff_h264_find_frame_end(h, buf, buf_size);
7705         assert((buf_size > 0) || (next == END_NOT_FOUND));
7706
7707         if( ff_combine_frame(&s->parse_context, next, &buf, &buf_size) < 0 )
7708           return buf_size;
7709 //printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
7710     }
7711
7712    /* no supplementary picture */
7713     if (buf_size == 0) {
7714         Picture *out;
7715         int i, out_idx;
7716
7717 //FIXME factorize this with the output code below
7718         out = h->delayed_pic[0];
7719         out_idx = 0;
7720         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7721             if(h->delayed_pic[i]->poc < out->poc){
7722                 out = h->delayed_pic[i];
7723                 out_idx = i;
7724             }
7725
7726         for(i=out_idx; h->delayed_pic[i]; i++)
7727             h->delayed_pic[i] = h->delayed_pic[i+1];
7728
7729         if(out){
7730             *data_size = sizeof(AVFrame);
7731             *pict= *(AVFrame*)out;
7732         }
7733
7734         return 0;
7735     }
7736
7737     if(h->is_avc && !h->got_avcC) {
7738         int i, cnt, nalsize;
7739         unsigned char *p = avctx->extradata;
7740         if(avctx->extradata_size < 7) {
7741             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7742             return -1;
7743         }
7744         if(*p != 1) {
7745             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7746             return -1;
7747         }
7748         /* sps and pps in the avcC always have length coded with 2 bytes,
7749            so put a fake nal_length_size = 2 while parsing them */
7750         h->nal_length_size = 2;
7751         // Decode sps from avcC
7752         cnt = *(p+5) & 0x1f; // Number of sps
7753         p += 6;
7754         for (i = 0; i < cnt; i++) {
7755             nalsize = AV_RB16(p) + 2;
7756             if(decode_nal_units(h, p, nalsize) < 0) {
7757                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7758                 return -1;
7759             }
7760             p += nalsize;
7761         }
7762         // Decode pps from avcC
7763         cnt = *(p++); // Number of pps
7764         for (i = 0; i < cnt; i++) {
7765             nalsize = AV_RB16(p) + 2;
7766             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7767                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7768                 return -1;
7769             }
7770             p += nalsize;
7771         }
7772         // Now store right nal length size, that will be use to parse all other nals
7773         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7774         // Do not reparse avcC
7775         h->got_avcC = 1;
7776     }
7777
7778     if(avctx->frame_number==0 && !h->is_avc && s->avctx->extradata_size){
7779         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7780             return -1;
7781     }
7782
7783     buf_index=decode_nal_units(h, buf, buf_size);
7784     if(buf_index < 0)
7785         return -1;
7786
7787     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7788         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7789         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7790         return -1;
7791     }
7792
7793     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7794         Picture *out = s->current_picture_ptr;
7795         Picture *cur = s->current_picture_ptr;
7796         int i, pics, cross_idr, out_of_order, out_idx;
7797
7798         s->mb_y= 0;
7799
7800         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7801         s->current_picture_ptr->pict_type= s->pict_type;
7802
7803         h->prev_frame_num_offset= h->frame_num_offset;
7804         h->prev_frame_num= h->frame_num;
7805         if(!s->dropable) {
7806             h->prev_poc_msb= h->poc_msb;
7807             h->prev_poc_lsb= h->poc_lsb;
7808             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7809         }
7810
7811         /*
7812          * FIXME: Error handling code does not seem to support interlaced
7813          * when slices span multiple rows
7814          * The ff_er_add_slice calls don't work right for bottom
7815          * fields; they cause massive erroneous error concealing
7816          * Error marking covers both fields (top and bottom).
7817          * This causes a mismatched s->error_count
7818          * and a bad error table. Further, the error count goes to
7819          * INT_MAX when called for bottom field, because mb_y is
7820          * past end by one (callers fault) and resync_mb_y != 0
7821          * causes problems for the first MB line, too.
7822          */
7823         if (!FIELD_PICTURE)
7824             ff_er_frame_end(s);
7825
7826         MPV_frame_end(s);
7827
7828         if (s->first_field) {
7829             /* Wait for second field. */
7830             *data_size = 0;
7831
7832         } else {
7833             cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7834             /* Derive top_field_first from field pocs. */
7835             cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7836
7837         //FIXME do something with unavailable reference frames
7838
7839             /* Sort B-frames into display order */
7840
7841             if(h->sps.bitstream_restriction_flag
7842                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7843                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7844                 s->low_delay = 0;
7845             }
7846
7847             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7848                && !h->sps.bitstream_restriction_flag){
7849                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7850                 s->low_delay= 0;
7851             }
7852
7853             pics = 0;
7854             while(h->delayed_pic[pics]) pics++;
7855
7856             assert(pics <= MAX_DELAYED_PIC_COUNT);
7857
7858             h->delayed_pic[pics++] = cur;
7859             if(cur->reference == 0)
7860                 cur->reference = DELAYED_PIC_REF;
7861
7862             cross_idr = 0;
7863             for(i=0; h->delayed_pic[i]; i++)
7864                 if(h->delayed_pic[i]->key_frame || h->delayed_pic[i]->poc==0)
7865                     cross_idr = 1;
7866
7867             out = h->delayed_pic[0];
7868             out_idx = 0;
7869             for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7870                 if(h->delayed_pic[i]->poc < out->poc){
7871                     out = h->delayed_pic[i];
7872                     out_idx = i;
7873                 }
7874
7875             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7876
7877             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7878                 { }
7879             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7880                || (s->low_delay &&
7881                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7882                  || cur->pict_type == FF_B_TYPE)))
7883             {
7884                 s->low_delay = 0;
7885                 s->avctx->has_b_frames++;
7886             }
7887
7888             if(out_of_order || pics > s->avctx->has_b_frames){
7889                 out->reference &= ~DELAYED_PIC_REF;
7890                 for(i=out_idx; h->delayed_pic[i]; i++)
7891                     h->delayed_pic[i] = h->delayed_pic[i+1];
7892             }
7893             if(!out_of_order && pics > s->avctx->has_b_frames){
7894                 *data_size = sizeof(AVFrame);
7895
7896                 h->outputed_poc = out->poc;
7897                 *pict= *(AVFrame*)out;
7898             }else{
7899                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7900             }
7901         }
7902     }
7903
7904     assert(pict->data[0] || !*data_size);
7905     ff_print_debug_info(s, pict);
7906 //printf("out %d\n", (int)pict->data[0]);
7907 #if 0 //?
7908
7909     /* Return the Picture timestamp as the frame number */
7910     /* we subtract 1 because it is added on utils.c     */
7911     avctx->frame_number = s->picture_number - 1;
7912 #endif
7913     return get_consumed_bytes(s, buf_index, buf_size);
7914 }
#if 0
/**
 * Fills h->mb_avail[0..5] for the macroblock at (s->mb_x, s->mb_y).
 *
 * Entries 0..2 describe the up-left, up and up-right positions in the
 * row above, entry 3 the position to the left. A position counts as
 * available when it lies inside the picture and its slice_table entry
 * equals h->slice_num. Entries 4 and 5 are the constants 1 and 0.
 *
 * This function is currently compiled out.
 */
static inline void fill_mb_avail(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int cur_xy = s->mb_x + s->mb_y*s->mb_stride;

    if(s->mb_y == 0){
        /* first macroblock row: there is no row above */
        h->mb_avail[0]= 0;
        h->mb_avail[1]= 0;
        h->mb_avail[2]= 0;
    }else{
        const int top_xy = cur_xy - s->mb_stride;

        h->mb_avail[0]= s->mb_x != 0
                        && h->slice_table[top_xy - 1] == h->slice_num;
        h->mb_avail[1]= h->slice_table[top_xy] == h->slice_num;
        h->mb_avail[2]= s->mb_x+1 < s->mb_width
                        && h->slice_table[top_xy + 1] == h->slice_num;
    }

    h->mb_avail[3]= s->mb_x != 0
                    && h->slice_table[cur_xy - 1] == h->slice_num;

    //FIXME move the two constant entries out
    h->mb_avail[4]= 1;
    h->mb_avail[5]= 0;
}
#endif
7934
#ifdef TEST
#undef printf
#undef random
#define COUNT 8000
#define SIZE (COUNT*40)
/**
 * Self-test entry point, only built when TEST is defined.
 *
 * Writes COUNT unsigned and then COUNT signed exp-golomb codes into a
 * buffer, reads them back and prints a message for every value that does
 * not round-trip. Mismatches are only reported; they do not stop the run.
 * The 4x4 (I)DCT, quantizer and NAL layer tests further down are disabled
 * with #if 0 and are kept as they were.
 *
 * @return 0
 */
int main(void){
    int i;
    uint8_t temp[SIZE];
    PutBitContext pb;
    GetBitContext gb;
//    int int_temp[10000];
    DSPContext dsp;
    AVCodecContext avctx;

    /* avctx used to be handed to dsputil_init() uninitialized; zero it so
     * that whatever dsputil_init() reads from it is at least well defined.
     * NOTE(review): confirm all-zero is an acceptable context for dsputil_init(). */
    memset(&avctx, 0, sizeof(avctx));

    dsputil_init(&dsp, &avctx);

    init_put_bits(&pb, temp, SIZE);
    printf("testing unsigned exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_ue_golomb(&pb, i);
        STOP_TIMER("set_ue_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        /* remember the upcoming bits for the diagnostic message */
        s= show_bits(&gb, 24);

        START_TIMER
        j= get_ue_golomb(&gb);
        if(j != i){
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
//            return -1;
        }
        STOP_TIMER("get_ue_golomb");
    }


    init_put_bits(&pb, temp, SIZE);
    printf("testing signed exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        /* encode values centred on zero: -COUNT/2 .. COUNT/2-1 */
        set_se_golomb(&pb, i - COUNT/2);
        STOP_TIMER("set_se_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_se_golomb(&gb);
        if(j != i - COUNT/2){
            /* report the value that was actually encoded as the expected one */
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i - COUNT/2, s);
//            return -1;
        }
        STOP_TIMER("get_se_golomb");
    }

#if 0
    printf("testing 4x4 (I)DCT\n");

    DCTELEM block[16];
    uint8_t src[16], ref[16];
    uint64_t error= 0, max_error=0;

    for(i=0; i<COUNT; i++){
        int j;
//        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
        for(j=0; j<16; j++){
            ref[j]= random()%255;
            src[j]= random()%255;
        }

        h264_diff_dct_c(block, src, ref, 4);

        //normalize
        for(j=0; j<16; j++){
//            printf("%d ", block[j]);
            block[j]= block[j]*4;
            if(j&1) block[j]= (block[j]*4 + 2)/5;
            if(j&4) block[j]= (block[j]*4 + 2)/5;
        }
//        printf("\n");

        s->dsp.h264_idct_add(ref, block, 4);
/*        for(j=0; j<16; j++){
            printf("%d ", ref[j]);
        }
        printf("\n");*/

        for(j=0; j<16; j++){
            int diff= FFABS(src[j] - ref[j]);

            error+= diff*diff;
            max_error= FFMAX(max_error, diff);
        }
    }
    printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
    printf("testing quantizer\n");
    for(qp=0; qp<52; qp++){
        for(i=0; i<16; i++)
            src1_block[i]= src2_block[i]= random()%255;

    }
    printf("Testing NAL layer\n");

    uint8_t bitstream[COUNT];
    uint8_t nal[COUNT*2];
    H264Context h;
    memset(&h, 0, sizeof(H264Context));

    for(i=0; i<COUNT; i++){
        int zeros= i;
        int nal_length;
        int consumed;
        int out_length;
        uint8_t *out;
        int j;

        for(j=0; j<COUNT; j++){
            bitstream[j]= (random() % 255) + 1;
        }

        for(j=0; j<zeros; j++){
            int pos= random() % COUNT;
            while(bitstream[pos] == 0){
                pos++;
                pos %= COUNT;
            }
            bitstream[pos]=0;
        }

        START_TIMER

        nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
        if(nal_length<0){
            printf("encoding failed\n");
            return -1;
        }

        out= decode_nal(&h, nal, &out_length, &consumed, nal_length);

        STOP_TIMER("NAL")

        if(out_length != COUNT){
            printf("incorrect length %d %d\n", out_length, COUNT);
            return -1;
        }

        if(consumed != nal_length){
            printf("incorrect consumed length %d %d\n", nal_length, consumed);
            return -1;
        }

        if(memcmp(bitstream, out, COUNT)){
            printf("mismatch\n");
            return -1;
        }
    }
#endif

    printf("Testing RBSP\n");


    return 0;
}
#endif /* TEST */
8109
8110
8111 static av_cold int decode_end(AVCodecContext *avctx)
8112 {
8113     H264Context *h = avctx->priv_data;
8114     MpegEncContext *s = &h->s;
8115
8116     av_freep(&h->rbsp_buffer[0]);
8117     av_freep(&h->rbsp_buffer[1]);
8118     free_tables(h); //FIXME cleanup init stuff perhaps
8119     MPV_common_end(s);
8120
8121 //    memset(h, 0, sizeof(H264Context));
8122
8123     return 0;
8124 }
8125
8126
8127 AVCodec h264_decoder = {
8128     "h264",
8129     CODEC_TYPE_VIDEO,
8130     CODEC_ID_H264,
8131     sizeof(H264Context),
8132     decode_init,
8133     NULL,
8134     decode_end,
8135     decode_frame,
8136     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,
8137     .flush= flush_dpb,
8138     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
8139 };
8140
8141 #include "svq3.c"